diff --git a/configs/nlp/sbert_sentence_similarity.json b/configs/nlp/sbert_sentence_similarity.json index 1e2bdef5..9320e0d7 100644 --- a/configs/nlp/sbert_sentence_similarity.json +++ b/configs/nlp/sbert_sentence_similarity.json @@ -2,7 +2,7 @@ "framework": "pytorch", "task": "sentence-similarity", "preprocessor": { - "type": "bert-seq-cls-tokenizer-finetune", + "type": "sen-sim-tokenizer", "first_sequence": "sentence1", "second_sequence": "sentence2" }, diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 8f6e7483..fff88cca 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,7 +4,7 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) -from modelscope.utils.utils import get_default_cache_dir +from modelscope.utils.file_utils import get_default_cache_dir def model_id_to_group_owner_name(model_id): diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 215233fe..e0326baa 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -53,6 +53,10 @@ class TaskModels(object): class Heads(object): # nlp heads text_classification = 'text-classification' + # mlm + bert_mlm = 'bert-mlm' + # roberta mlm + roberta_mlm = 'roberta-mlm' class Pipelines(object): @@ -137,7 +141,7 @@ class Trainers(object): Holds the standard trainer name to use for identifying different trainer. This should be used to register trainers. - For a general Trainer, you can use easynlp-trainer/ofa-trainer/sofa-trainer. + For a general Trainer, you can use easynlp-trainer/ofa-trainer. For a model specific Trainer, you can use ${ModelName}-${Task}-trainer. """ @@ -179,6 +183,8 @@ class Preprocessors(object): sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer' zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' text_error_correction = 'text-error-correction' + word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' + fill_mask = 'fill-mask' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' @@ -204,7 +210,7 @@ class Metrics(object): # metric for image instance segmentation task image_ins_seg_coco_metric = 'image-ins-seg-coco-metric' # metrics for sequence classification task - seq_cls_metric = 'seq_cls_metric' + seq_cls_metric = 'seq-cls-metric' # metrics for token-classification task token_cls_metric = 'token-cls-metric' # metrics for text-generation task diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index c632a9bd..37f9bfec 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from .image_portrait_enhancement_metric import ImagePortraitEnhancementMetric from .sequence_classification_metric import SequenceClassificationMetric from .text_generation_metric import TextGenerationMetric + from .token_classification_metric import TokenClassificationMetric else: _import_structure = { @@ -26,6 +27,7 @@ else: ['ImagePortraitEnhancementMetric'], 'sequence_classification_metric': ['SequenceClassificationMetric'], 'text_generation_metric': ['TextGenerationMetric'], + 'token_classification_metric': ['TokenClassificationMetric'], } import sys diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 1b9db825..3a9d810f 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,6 +10,9 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. 
""" + def __init__(self, trainer=None, *args, **kwargs): + self.trainer = trainer + @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 4df856f2..bd20d37b 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -20,7 +20,9 @@ class MetricKeys(object): task_default_metrics = { Tasks.image_segmentation: [Metrics.image_ins_seg_coco_metric], Tasks.sentence_similarity: [Metrics.seq_cls_metric], + Tasks.nli: [Metrics.seq_cls_metric], Tasks.sentiment_classification: [Metrics.seq_cls_metric], + Tasks.token_classification: [Metrics.token_cls_metric], Tasks.text_generation: [Metrics.text_gen_metric], Tasks.image_denoising: [Metrics.image_denoise_metric], Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index dabdb725..04b0ee81 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -17,14 +17,14 @@ class SequenceClassificationMetric(Metric): """The metric computation class for sequence classification classes. """ - label_name = 'labels' - - def __init__(self): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.preds = [] self.labels = [] def add(self, outputs: Dict, inputs: Dict): - ground_truths = inputs[self.label_name] + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] eval_results = outputs[OutputKeys.LOGITS] self.preds.append( torch_nested_numpify(torch_nested_detach(eval_results))) diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py new file mode 100644 index 00000000..8606148e --- /dev/null +++ b/modelscope/metrics/token_classification_metric.py @@ -0,0 +1,123 @@ +import importlib +from typing import Dict, List, Optional, Union + +import numpy as np + +from modelscope.outputs import OutputKeys +from ..metainfo import Metrics +from ..utils.registry import default_group +from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.token_cls_metric) +class TokenClassificationMetric(Metric): + """ + The metric computation class for token-classification task. + Args: + return_entity_level_metrics (bool, *optional*): + Whether to return every label's detail metrics, default False. 
+ """ + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[OutputKeys.LOGITS] + self.preds.append( + torch_nested_numpify(torch_nested_detach(eval_results))) + self.labels.append( + torch_nested_numpify(torch_nested_detach(ground_truths))) + + def __init__(self, return_entity_level_metrics=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.return_entity_level_metrics = return_entity_level_metrics + self.preds = [] + self.labels = [] + + def evaluate(self): + self.id2label = { + id: label + for label, id in self.trainer.label2id.items() + } + self.preds = np.concatenate(self.preds, axis=0) + self.labels = np.concatenate(self.labels, axis=0) + predictions = np.argmax(self.preds, axis=-1) + + true_predictions = [[ + self.id2label[p] for (p, lb) in zip(prediction, label) + if lb != -100 + ] for prediction, label in zip(predictions, self.labels)] + true_labels = [[ + self.id2label[lb] for (p, lb) in zip(prediction, label) + if lb != -100 + ] for prediction, label in zip(predictions, self.labels)] + + results = self._compute( + predictions=true_predictions, references=true_labels) + if self.return_entity_level_metrics: + final_results = {} + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f'{key}_{n}'] = v + else: + final_results[key] = value + return final_results + else: + return { + MetricKeys.PRECISION: results[MetricKeys.PRECISION], + MetricKeys.RECALL: results[MetricKeys.RECALL], + MetricKeys.F1: results[MetricKeys.F1], + MetricKeys.ACCURACY: results[MetricKeys.ACCURACY], + } + + @staticmethod + def _compute( + predictions, + references, + suffix: bool = False, + scheme: Optional[str] = None, + mode: Optional[str] = None, + sample_weight: Optional[List[int]] = None, + zero_division: Union[str, int] = 'warn', + ): + from seqeval.metrics import accuracy_score, classification_report + if scheme is not None: + try: + scheme_module = importlib.import_module('seqeval.scheme') + scheme = getattr(scheme_module, scheme) + except AttributeError: + raise ValueError( + f'Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}' + ) + report = classification_report( + y_true=references, + y_pred=predictions, + suffix=suffix, + output_dict=True, + scheme=scheme, + mode=mode, + sample_weight=sample_weight, + zero_division=zero_division, + ) + report.pop('macro avg') + report.pop('weighted avg') + overall_score = report.pop('micro avg') + + scores = { + type_name: { + MetricKeys.PRECISION: score['precision'], + MetricKeys.RECALL: score['recall'], + MetricKeys.F1: score['f1-score'], + 'number': score['support'], + } + for type_name, score in report.items() + } + scores[MetricKeys.PRECISION] = overall_score['precision'] + scores[MetricKeys.RECALL] = overall_score['recall'] + scores[MetricKeys.F1] = overall_score['f1-score'] + scores[MetricKeys.ACCURACY] = accuracy_score( + y_true=references, y_pred=predictions) + return scores diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index fd556dd4..3b596769 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -10,6 +10,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile 
+from modelscope.utils.file_utils import func_receive_dict_inputs +from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger logger = get_logger() @@ -69,6 +71,7 @@ class Model(ABC): def from_pretrained(cls, model_name_or_path: str, revision: Optional[str] = DEFAULT_MODEL_REVISION, + cfg_dict: Config = None, *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. Note @@ -87,25 +90,25 @@ class Model(ABC): ) local_model_dir = snapshot_download(model_name_or_path, revision) logger.info(f'initialize model from {local_model_dir}') - cfg = Config.from_file( - osp.join(local_model_dir, ModelFile.CONFIGURATION)) + if cfg_dict is not None: + cfg = cfg_dict + else: + cfg = Config.from_file( + osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task model_cfg = cfg.model - assert hasattr( - cfg, 'pipeline'), 'pipeline config is missing from config file.' - pipeline_cfg = cfg.pipeline # TODO @wenmeng.zwm may should manually initialize model after model building if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type model_cfg.model_dir = local_model_dir - for k, v in kwargs.items(): model_cfg[k] = v model = build_model( model_cfg, task_name=task_name, default_args=kwargs) # dynamically add pipeline info to model for pipeline inference - model.pipeline = pipeline_cfg + if hasattr(cfg, 'pipeline'): + model.pipeline = cfg.pipeline return model diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index 52d4460c..cfc88721 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Union import torch from torch import nn +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from .base_model import Model @@ -20,6 +21,13 @@ class TorchModel(Model, torch.nn.Module): super().__init__(model_dir, *args, **kwargs) torch.nn.Module.__init__(self) + def __call__(self, input: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + if func_receive_dict_inputs(self.forward): + return self.postprocess(self.forward(input)) + else: + return self.postprocess(self.forward(**input)) + def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -50,6 +58,3 @@ class TorchModel(Model, torch.nn.Module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - - def compute_loss(self, outputs: Dict[str, Any], labels): - raise NotImplementedError() diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index f2219b0e..24e65ef1 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -4,32 +4,26 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .backbones import (SbertModel, SpaceGenerator, SpaceModelBase, - GPT3Model) + from .backbones import SbertModel from .heads import SequenceClassificationHead from .bert_for_sequence_classification import BertForSequenceClassification from .csanmt_for_translation import CsanmtForTranslation from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, BertForMaskedLM) from .nncrf_for_named_entity_recognition import TransformerCRFForNamedEntityRecognition - from .palm_for_text_generation import PalmForTextGeneration - from 
.sbert_for_nli import SbertForNLI - from .sbert_for_sentence_similarity import SbertForSentenceSimilarity - from .sbert_for_sentiment_classification import SbertForSentimentClassification - from .sbert_for_token_classification import SbertForTokenClassification - from .sbert_for_zero_shot_classification import SbertForZeroShotClassification - from .sequence_classification import SequenceClassificationModel - from .space_for_dialog_intent_prediction import SpaceForDialogIntent - from .space_for_dialog_modeling import SpaceForDialogModeling - from .space_for_dialog_state_tracking import SpaceForDialogStateTracking - from .task_model import SingleBackboneTaskModelBase + from .palm_v2 import PalmForTextGeneration + from .token_classification import SbertForTokenClassification + from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification + from .space import SpaceForDialogIntent + from .space import SpaceForDialogModeling + from .space import SpaceForDialogStateTracking + from .task_models.task_model import SingleBackboneTaskModelBase from .bart_for_text_error_correction import BartForTextErrorCorrection - from .gpt3_for_text_generation import GPT3ForTextGeneration + from .gpt3 import GPT3ForTextGeneration else: _import_structure = { - 'backbones': - ['SbertModel', 'SpaceGenerator', 'SpaceModelBase', 'GPT3Model'], + 'backbones': ['SbertModel'], 'heads': ['SequenceClassificationHead'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], @@ -37,21 +31,17 @@ else: ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], 'nncrf_for_named_entity_recognition': ['TransformerCRFForNamedEntityRecognition'], - 'palm_for_text_generation': ['PalmForTextGeneration'], - 'sbert_for_nli': ['SbertForNLI'], - 'sbert_for_sentence_similarity': ['SbertForSentenceSimilarity'], - 'sbert_for_sentiment_classification': - ['SbertForSentimentClassification'], - 'sbert_for_token_classification': ['SbertForTokenClassification'], - 'sbert_for_zero_shot_classification': - ['SbertForZeroShotClassification'], - 'sequence_classification': ['SequenceClassificationModel'], - 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], - 'space_for_dialog_modeling': ['SpaceForDialogModeling'], - 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + 'palm_v2': ['PalmForTextGeneration'], + 'token_classification': ['SbertForTokenClassification'], + 'sequence_classification': + ['VecoForSequenceClassification', 'SbertForSequenceClassification'], + 'space': [ + 'SpaceForDialogIntent', 'SpaceForDialogModeling', + 'SpaceForDialogStateTracking' + ], 'task_model': ['SingleBackboneTaskModelBase'], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], + 'gpt3': ['GPT3ForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/backbones/__init__.py b/modelscope/models/nlp/backbones/__init__.py index ffe8ac05..749cf995 100644 --- a/modelscope/models/nlp/backbones/__init__.py +++ b/modelscope/models/nlp/backbones/__init__.py @@ -4,14 +4,10 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .space import SpaceGenerator, SpaceModelBase from .structbert import SbertModel - from .gpt3 import GPT3Model else: _import_structure = { - 'space': ['SpaceGenerator', 'SpaceModelBase'], 'structbert': ['SbertModel'], - 'gpt3': ['GPT3Model'] } import sys diff --git 
a/modelscope/models/nlp/backbones/space/__init__.py b/modelscope/models/nlp/backbones/space/__init__.py deleted file mode 100644 index a2be83ef..00000000 --- a/modelscope/models/nlp/backbones/space/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .model.generator import Generator as SpaceGenerator -from .model.model_base import SpaceModelBase diff --git a/modelscope/models/nlp/backbones/space/model/__init__.py b/modelscope/models/nlp/backbones/space/model/__init__.py deleted file mode 100644 index 7e1b5264..00000000 --- a/modelscope/models/nlp/backbones/space/model/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .gen_unified_transformer import GenUnifiedTransformer -from .intent_unified_transformer import IntentUnifiedTransformer -from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py new file mode 100644 index 00000000..125db040 --- /dev/null +++ b/modelscope/models/nlp/backbones/structbert.py @@ -0,0 +1,54 @@ +from transformers import PreTrainedModel + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import BACKBONES +from modelscope.models.nlp.structbert import SbertConfig +from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) +class SbertModel(TorchModel, SbertModelTransform): + + def __init__(self, model_dir=None, add_pooling_layer=True, **config): + """ + Args: + model_dir (str, optional): The model checkpoint directory. Defaults to None. + add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. + """ + config = SbertConfig(**config) + super().__init__(model_dir) + self.config = config + SbertModelTransform.__init__(self, config, add_pooling_layer) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return SbertModelTransform.forward( + self, input_ids, attention_mask, token_type_ids, position_ids, + head_mask, inputs_embeds, encoder_hidden_states, + encoder_attention_mask, past_key_values, use_cache, + output_attentions, output_hidden_states, return_dict) diff --git a/modelscope/models/nlp/backbones/structbert/__init__.py b/modelscope/models/nlp/backbones/structbert/__init__.py deleted file mode 100644 index 1d147730..00000000 --- a/modelscope/models/nlp/backbones/structbert/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import TYPE_CHECKING - -from modelscope.utils.import_utils import LazyImportModule - -if TYPE_CHECKING: - from .modeling_sbert import SbertModel -else: - _import_structure = {'modeling_sbert': ['SbertModel']} - - import sys - - sys.modules[__name__] = LazyImportModule( - __name__, - globals()['__file__'], - _import_structure, - module_spec=__spec__, - extra_objects={}, - ) diff --git a/modelscope/models/nlp/backbones/structbert/modeling_sbert.py b/modelscope/models/nlp/backbones/structbert/modeling_sbert.py deleted file mode 100644 index 2e67a652..00000000 --- a/modelscope/models/nlp/backbones/structbert/modeling_sbert.py +++ /dev/null @@ -1,815 +0,0 @@ -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from transformers import PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput) -from transformers.modeling_utils import (apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger -from .configuration_sbert import SbertConfig - -logger = get_logger(__name__) - - -@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) -class SbertModel(TorchModel, PreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration - set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an - input to the forward pass. - """ - - def __init__(self, model_dir=None, add_pooling_layer=True, **config): - """ - Args: - model_dir (str, optional): The model checkpoint directory. Defaults to None. - add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. - """ - config = SbertConfig(**config) - super().__init__(model_dir) - self.config = config - - self.embeddings = SbertEmbeddings(config) - self.encoder = SbertEncoder(config) - - self.pooler = SbertPooler(config) if add_pooling_layer else None - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)` - , `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, - sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output, orignal_embeds = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - return_inputs_embeds=True, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) - - return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - -class SbertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - self.register_buffer( - 'position_ids', - 
torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros( - self.position_ids.size(), - dtype=torch.long, - device=self.position_ids.device), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - return_inputs_embeds=False): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids - # issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - if not return_inputs_embeds: - return embeddings - else: - return embeddings, inputs_embeds - - -class SbertSelfAttention(nn.Module): - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = 
self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. 
- attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class SbertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertAttention(nn.Module): - - def __init__(self, config): - super().__init__() - self.self = SbertSelfAttention(config) - self.output = SbertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class SbertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class SbertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, 
eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = SbertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = SbertAttention(config) - self.intermediate = SbertIntermediate(config) - self.output = SbertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated' - f'with cross-attention layers by setting `config.add_cross_attention=True`' - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class SbertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [SbertLayer(config) for _ in 
range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class SbertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -@dataclass -class SbertForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~structbert.utils.BertForPreTraining`. - - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when - ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - BaseModelOutputWithPoolingAndCrossAttentions): - embedding_output: torch.FloatTensor = None - logits: Optional[Union[tuple, torch.FloatTensor]] = None - kwargs: dict = None diff --git a/modelscope/models/nlp/backbones/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py similarity index 76% rename from modelscope/models/nlp/backbones/gpt3/__init__.py rename to modelscope/models/nlp/gpt3/__init__.py index b0739c22..076a0c6b 100644 --- a/modelscope/models/nlp/backbones/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .configuration_gpt3 import GPT3Config from .modeling_gpt3 import GPT3Model + from .gpt3_for_text_generation import GPT3ForTextGeneration else: _import_structure = { 'configuration_gpt3': ['GPT3Config'], - 'modeling_gpt3': ['GPT3Model'] + 'modeling_gpt3': ['GPT3Model'], + 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/backbones/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration_gpt3.py similarity index 100% rename from modelscope/models/nlp/backbones/gpt3/configuration_gpt3.py rename to modelscope/models/nlp/gpt3/configuration_gpt3.py diff --git a/modelscope/models/nlp/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py similarity index 97% rename from modelscope/models/nlp/gpt3_for_text_generation.py rename to modelscope/models/nlp/gpt3/gpt3_for_text_generation.py index 22a6458d..6bdcb431 100644 --- a/modelscope/models/nlp/gpt3_for_text_generation.py +++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py @@ -20,7 +20,7 @@ class GPT3ForTextGeneration(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from modelscope.models.nlp import GPT3Model + from modelscope.models.nlp.gpt3 import GPT3Model from transformers import BertTokenizer self.model = GPT3Model.from_pretrained(model_dir) diff --git a/modelscope/models/nlp/backbones/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py similarity index 100% rename from 
modelscope/models/nlp/backbones/gpt3/modeling_gpt3.py rename to modelscope/models/nlp/gpt3/modeling_gpt3.py diff --git a/modelscope/models/nlp/heads/__init__.py b/modelscope/models/nlp/heads/__init__.py index 6ae43f6d..19194d3a 100644 --- a/modelscope/models/nlp/heads/__init__.py +++ b/modelscope/models/nlp/heads/__init__.py @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_head import SequenceClassificationHead + from .torch_pretrain_head import BertMLMHead, RobertaMLMHead else: _import_structure = { - 'sequence_classification_head': ['SequenceClassificationHead'] + 'sequence_classification_head': ['SequenceClassificationHead'], + 'torch_pretrain_head': ['BertMLMHead', 'RobertaMLMHead'], } import sys diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py index 8c6e2188..92f3a4ec 100644 --- a/modelscope/models/nlp/heads/sequence_classification_head.py +++ b/modelscope/models/nlp/heads/sequence_classification_head.py @@ -1,5 +1,4 @@ -import importlib -from typing import Dict, List, Optional, Union +from typing import Dict import torch import torch.nn.functional as F diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py new file mode 100644 index 00000000..6ff6c96f --- /dev/null +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -0,0 +1,26 @@ +from typing import Dict + +import torch +from transformers.models.bert.modeling_bert import BertOnlyMLMHead +from transformers.models.roberta.modeling_roberta import RobertaLMHead + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.utils.constant import Tasks + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +class BertMLMHead(BertOnlyMLMHead, TorchHead): + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.roberta_mlm) +class RobertaMLMHead(RobertaLMHead, TorchHead): + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + raise NotImplementedError() diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index ffe9631d..ff16335f 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,72 +1,115 @@ -from typing import Dict +from typing import Any, Dict, Optional, Union import numpy as np +from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertForMaskedLM +from modelscope.models.nlp.veco import \ + VecoForMaskedLM as VecoForMaskedLMTransformer +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks __all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] -class MaskedLanguageModelBase(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - self.model = self.build_model() - - def build_model(self): - raise NotImplementedError() - - def train(self): - return 
self.model.train() - - def eval(self): - return self.model.eval() - - @property - def config(self): - if hasattr(self.model, 'config'): - return self.model.config - return None - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - """ - rst = self.model( - input_ids=input['input_ids'], - attention_mask=input['attention_mask'], - token_type_ids=input['token_type_ids']) - return {'logits': rst['logits'], 'input_ids': input['input_ids']} - - @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) -class StructBertForMaskedLM(MaskedLanguageModelBase): - - def build_model(self): - from sofa import SbertForMaskedLM - return SbertForMaskedLM.from_pretrained(self.model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) -class VecoForMaskedLM(MaskedLanguageModelBase): - - def build_model(self): - from sofa import VecoForMaskedLM - return VecoForMaskedLM.from_pretrained(self.model_dir) +class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + SbertForMaskedLM.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = SbertForMaskedLM.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, model_dir=model_dir) @MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) -class BertForMaskedLM(MaskedLanguageModelBase): +class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + BertForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = BertForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(BertForMaskedLMTransformer, + BertForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) - def build_model(self): - from transformers import BertForMaskedLM - return BertForMaskedLM.from_pretrained(self.model_dir) + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) +class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + VecoForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = VecoForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + 
token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(VecoForMaskedLMTransformer, + VecoForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py new file mode 100644 index 00000000..3a9960ec --- /dev/null +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_palm import PalmConfig + from .modeling_palm import ( + AbsSummarizer, + PalmForConditionalGeneration, + Translator, + ) + from .palm_for_text_generation import PalmForTextGeneration +else: + _import_structure = { + 'configuration_palm': ['PalmConfig'], + 'modeling_palm': + ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], + 'palm_for_text_generation': ['PalmForTextGeneration'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/palm_v2/configuration_palm.py b/modelscope/models/nlp/palm_v2/configuration_palm.py new file mode 100644 index 00000000..3b9e51fb --- /dev/null +++ b/modelscope/models/nlp/palm_v2/configuration_palm.py @@ -0,0 +1,116 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PALM model configuration """ + +from transformers.configuration_utils import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class PalmConfig(PretrainedConfig): + r""" + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the BERT model. 
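# Sketch (not part of the patch): how the masked-LM wrappers defined in masked_language.py
# above are meant to be instantiated. `_instantiate` forwards a local checkpoint directory
# to the underlying transformers-style `from_pretrained`; the path below is hypothetical,
# and in normal use these classes are built indirectly through the MODELS registry, where
# they are registered for Tasks.fill_mask.
import torch

from modelscope.models.nlp.masked_language import BertForMaskedLM

model = BertForMaskedLM._instantiate(model_dir='/path/to/local/bert-checkpoint')
model.eval()

input_ids = torch.tensor([[101, 2023, 2003, 103, 102]])   # toy sequence containing one [MASK]
with torch.no_grad():
    outputs = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        token_type_ids=torch.zeros_like(input_ids))
# The wrapper's forward above also copies the input ids into the output under
# OutputKeys.INPUT_IDS, presumably so downstream postprocessing can locate the masked
# positions.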
Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layernorm_epsilon (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + dec_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer decoder. + attn_separate (:obj:`bool`, `optional`, defaults to false): + Whether or not to separate the q, k, v of attention. 
+ + Examples:: + + >>> from modelscope.models.nlp.palm_v2 import PalmForConditionalGeneration, PalmConfig + >>> configuration = PalmConfig() + + >>> # Initializing a model from the configuration + >>> model = PalmForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = 'palm' + + def __init__(self, + encoder='roberta', + encoder_pth='roberta-base', + max_pos=512, + share_emb=False, + dec_layers=12, + dec_hidden_size=768, + dec_heads=8, + dec_ff_size=3072, + dec_dropout=0.2, + use_bert_emb=True, + label_smoothing=0.1, + alpha=0.95, + beam_size=5, + min_length=40, + max_length=130, + sample_topk=False, + block_trigram=False, + **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.encoder_pth = encoder_pth + self.max_pos = max_pos + self.share_emb = share_emb + self.dec_layers = dec_layers + self.dec_hidden_size = dec_hidden_size + self.dec_heads = dec_heads + self.dec_ff_size = dec_ff_size + self.dec_dropout = dec_dropout + self.use_bert_emb = use_bert_emb + self.label_smoothing = label_smoothing + # Translator + self.alpha = alpha + self.beam_size = beam_size + self.min_length = min_length + self.max_length = max_length + self.sample_topk = sample_topk + self.block_trigram = block_trigram diff --git a/modelscope/models/nlp/palm_v2/dureader_eval.py b/modelscope/models/nlp/palm_v2/dureader_eval.py new file mode 100644 index 00000000..db54f21d --- /dev/null +++ b/modelscope/models/nlp/palm_v2/dureader_eval.py @@ -0,0 +1,872 @@ +# ============================================================================== +# Copyright 2017 Baidu.com, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +This module computes evaluation metrics for DuReader dataset. 
+""" + +import argparse +import copy +import math +import re +import sys +import zipfile +from collections import Counter, defaultdict + +import json +import numpy as np +from rouge import Rouge + +EMPTY = '' +YESNO_LABELS = set(['Yes', 'No', 'Depends']) + + +def my_lcs(string, sub): + """ + Calculates longest common subsequence for a pair of tokenized strings + :param string : list of str : tokens from a string split using whitespace + :param sub : list of str : shorter string, also split using whitespace + :returns: length (list of int): length of the longest common subsequence between the two strings + + Note: my_lcs only gives length of the longest common subsequence, not the actual LCS + """ + if (len(string) < len(sub)): + sub, string = string, sub + + lengths = [[0 for i in range(0, + len(sub) + 1)] + for j in range(0, + len(string) + 1)] + + for j in range(1, len(sub) + 1): + for i in range(1, len(string) + 1): + if (string[i - 1] == sub[j - 1]): + lengths[i][j] = lengths[i - 1][j - 1] + 1 + else: + lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) + + return lengths[len(string)][len(sub)] + + +class Bleu: + + def __init__(self, n=4): + # default compute Blue score up to 4 + self._n = n + self._hypo_for_image = {} + self.ref_for_image = {} + + def compute_score(self, gts, res): + assert (list(gts.keys()) == list(res.keys())) + imgIds = list(gts.keys()) + + bleu_scorer = BleuScorer(n=self._n) + for id in imgIds: + hypo = res[id] + ref = gts[id] + + # Sanity check. + assert (type(hypo) is list) + assert (len(hypo) == 1) + assert (type(ref) is list) + assert (len(ref) >= 1) + + bleu_scorer += (hypo[0], ref) + + score, scores = bleu_scorer.compute_score(option='closest', verbose=1) + return score, scores + + def method(self): + return 'Bleu' + + +def precook(s, n=4, out=False): + """Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well.""" + words = s.split() + counts = defaultdict(int) + for k in range(1, n + 1): + for i in range(len(words) - k + 1): + ngram = tuple(words[i:i + k]) + counts[ngram] += 1 + return (len(words), counts) + + +def cook_refs(refs, eff=None, n=4): # lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.''' + + reflen = [] + maxcounts = {} + for ref in refs: + rl, counts = precook(ref, n) + reflen.append(rl) + for (ngram, count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) + + # Calculate effective reference sentence length. + if eff == 'shortest': + reflen = min(reflen) + elif eff == 'average': + reflen = float(sum(reflen)) / len(reflen) + + # lhuang: N.B.: leave reflen computaiton to the very end!! + + # lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) + + return reflen, maxcounts + + +def cook_test(test, xxx_todo_changeme, eff=None, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.''' + (reflen, refmaxcounts) = xxx_todo_changeme + testlen, counts = precook(test, n, True) + + result = {} + + # Calculate effective reference sentence length. 
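# Sketch (not part of the patch): minimal use of the Bleu wrapper defined above. Both
# dicts are keyed by the same ids; each prediction list must hold exactly one string,
# while a reference list may hold several.
from modelscope.models.nlp.palm_v2.dureader_eval import Bleu

predictions = {'q1': ['the cat sat on the mat']}
references = {'q1': ['the cat is sitting on the mat', 'there is a cat on the mat']}
bleu_scores, per_id_scores = Bleu(n=4).compute_score(references, predictions)
# bleu_scores is [Bleu-1, Bleu-2, Bleu-3, Bleu-4]; per_id_scores holds the per-question values.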
+ + if eff == 'closest': + result['reflen'] = min((abs(ref - testlen), ref) for ref in reflen)[1] + else: # i.e., "average" or "shortest" or None + result['reflen'] = reflen + + result['testlen'] = testlen + + result['guess'] = [max(0, testlen - k + 1) for k in range(1, n + 1)] + + result['correct'] = [0] * n + for (ngram, count) in counts.items(): + result['correct'][len(ngram) - 1] += min( + refmaxcounts.get(ngram, 0), count) + + return result + + +class BleuScorer(object): + """Bleu scorer. + """ + + __slots__ = 'n', 'crefs', 'ctest', '_score', '_ratio', '_testlen', '_reflen', 'special_reflen' + + # special_reflen is used in oracle (proportional effective ref len for a node). + + def copy(self): + ''' copy the refs.''' + new = BleuScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + new._score = None + return new + + def __init__(self, test=None, refs=None, n=4, special_reflen=None): + ''' singular instance ''' + + self.n = n + self.crefs = [] + self.ctest = [] + self.cook_append(test, refs) + self.special_reflen = special_reflen + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + cooked_test = cook_test(test, self.crefs[-1]) + self.ctest.append(cooked_test) # N.B.: -1 + else: + self.ctest.append( + None) # lens of crefs and ctest have to match + + self._score = None # need to recompute + + def ratio(self, option=None): + self.compute_score(option=option) + return self._ratio + + def score_ratio(self, option=None): + '''return (bleu, len_ratio) pair''' + return (self.fscore(option=option), self.ratio(option=option)) + + def score_ratio_str(self, option=None): + return '%.4f (%.2f)' % self.score_ratio(option) + + def reflen(self, option=None): + self.compute_score(option=option) + return self._reflen + + def testlen(self, option=None): + self.compute_score(option=option) + return self._testlen + + def retest(self, new_test): + if type(new_test) is str: + new_test = [new_test] + assert len(new_test) == len(self.crefs), new_test + self.ctest = [] + for t, rs in zip(new_test, self.crefs): + self.ctest.append(cook_test(t, rs)) + self._score = None + + return self + + def rescore(self, new_test): + ''' replace test(s) with new test(s), and returns the new score.''' + + return self.retest(new_test).compute_score() + + def size(self): + assert len(self.crefs) == len( + self.ctest), 'refs/test mismatch! %d<>%d' % (len( + self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + # avoid creating new BleuScorer instances + self.cook_append(other[0], other[1]) + else: + assert self.compatible(other), 'incompatible BLEUs.' 
+ self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + self._score = None # need to recompute + + return self + + def compatible(self, other): + return isinstance(other, BleuScorer) and self.n == other.n + + def single_reflen(self, option='average'): + return self._single_reflen(self.crefs[0][0], option) + + def _single_reflen(self, reflens, option=None, testlen=None): + + if option == 'shortest': + reflen = min(reflens) + elif option == 'average': + reflen = float(sum(reflens)) / len(reflens) + elif option == 'closest': + reflen = min((abs(ref - testlen), ref) for ref in reflens)[1] + else: + assert False, 'unsupported reflen option %s' % option + + return reflen + + def recompute_score(self, option=None, verbose=0): + self._score = None + return self.compute_score(option, verbose) + + def compute_score(self, option=None, verbose=0): + n = self.n + small = 1e-9 + tiny = 1e-15 # so that if guess is 0 still return 0 + bleu_list = [[] for _ in range(n)] + + if self._score is not None: + return self._score + + if option is None: + option = 'average' if len(self.crefs) == 1 else 'closest' + + self._testlen = 0 + self._reflen = 0 + totalcomps = { + 'testlen': 0, + 'reflen': 0, + 'guess': [0] * n, + 'correct': [0] * n + } + + # for each sentence + for comps in self.ctest: + testlen = comps['testlen'] + self._testlen += testlen + + if self.special_reflen is None: # need computation + reflen = self._single_reflen(comps['reflen'], option, testlen) + else: + reflen = self.special_reflen + + self._reflen += reflen + + for key in ['guess', 'correct']: + for k in range(n): + totalcomps[key][k] += comps[key][k] + + # append per image bleu score + bleu = 1. + for k in range(n): + bleu *= (float(comps['correct'][k]) + tiny) / ( + float(comps['guess'][k]) + small) + bleu_list[k].append(bleu**(1. / (k + 1))) + ratio = (testlen + tiny) / (reflen + small + ) # N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleu_list[k][-1] *= math.exp(1 - 1 / ratio) + + if verbose > 1: + print(comps, reflen) + + totalcomps['reflen'] = self._reflen + totalcomps['testlen'] = self._testlen + + bleus = [] + bleu = 1. + for k in range(n): + bleu *= float(totalcomps['correct'][k] + tiny) / ( + totalcomps['guess'][k] + small) + bleus.append(bleu**(1. / (k + 1))) + ratio = (self._testlen + tiny) / (self._reflen + small + ) # N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleus[k] *= math.exp(1 - 1 / ratio) + + if verbose > 0: + print(totalcomps) + print('ratio:', ratio) + + self._score = bleus + return self._score, bleu_list + + +def normalize(s): + """ + Normalize strings to space joined chars. + + Args: + s: a list of strings. + + Returns: + A list of normalized strings. + """ + if not s: + return s + normalized = [] + for ss in s: + tokens = [c for c in list(ss) if len(c.strip()) != 0] + normalized.append(' '.join(tokens)) + return normalized + + +def data_check(obj, task): + """ + Check data. + + Raises: + Raises AssertionError when data is not legal. + """ + assert 'question_id' in obj, "Missing 'question_id' field." + assert 'question_type' in obj, \ + "Missing 'question_type' field. question_id: {}".format(obj['question_type']) + + assert 'yesno_answers' in obj, \ + "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) + assert isinstance(obj['yesno_answers'], list), \ + r"""'yesno_answers' field must be a list, if the 'question_type' is not + 'YES_NO', then this field should be an empty list. 
+ question_id: {}""".format(obj['question_id']) + + assert 'entity_answers' in obj, \ + "Missing 'entity_answers' field. question_id: {}".format(obj['question_id']) + assert isinstance( + obj['entity_answers'], + list) and len(obj['entity_answers']) > 0, r"""'entity_answers' field + must be a list, and has at least one element, which can be a empty list. + question_id: {}""".format(obj['question_id']) + + +def read_file(file_name, task, is_ref=False): + """ + Read predict answers or reference answers from file. + + Args: + file_name: the name of the file containing predict result or reference + result. + + Returns: + A dictionary mapping question_id to the result information. The result + information itself is also a dictionary with has four keys: + - question_type: type of the query. + - yesno_answers: A list of yesno answers corresponding to 'answers'. + - answers: A list of predicted answers. + - entity_answers: A list, each element is also a list containing the entities + tagged out from the corresponding answer string. + """ + + def _open(file_name, mode, zip_obj=None): + if zip_obj is not None: + return zip_obj.open(file_name, mode) + return open(file_name, mode) + + results = {} + keys = ['answers', 'yesno_answers', 'entity_answers', 'question_type'] + if is_ref: + keys += ['source'] + + zf = zipfile.ZipFile(file_name, + 'r') if file_name.endswith('.zip') else None + file_list = [file_name] if zf is None else zf.namelist() + + for fn in file_list: + for line in _open(fn, 'r', zip_obj=zf): + try: + obj = json.loads(line.strip()) + except ValueError: + raise ValueError('Every line of data should be legal json') + data_check(obj, task) + qid = obj['question_id'] + assert qid not in results, 'Duplicate question_id: {}'.format(qid) + results[qid] = {} + for k in keys: + results[qid][k] = obj[k] + return results + + +def compute_bleu_rouge(pred_dict, ref_dict, bleu_order=4): + """ + Compute bleu and rouge scores. + """ + assert set(pred_dict.keys()) == set(ref_dict.keys()), \ + 'missing keys: {}'.format(set(ref_dict.keys()) - set(pred_dict.keys())) + scores = {} + bleu_scores, _ = Bleu(bleu_order).compute_score(ref_dict, pred_dict) + for i, bleu_score in enumerate(bleu_scores): + scores['Bleu-%d' % (i + 1)] = bleu_score + # rouge_score, _ = Rouge().compute_score(ref_dict, pred_dict) + rouge_score = Rouge().get_scores( + list(map(lambda x: x[0], pred_dict.values())), + list(map(lambda x: x[0], ref_dict.values()))) + rouge_score = sum([d['rouge-l']['f'] + for d in rouge_score]) / len(rouge_score) + scores['Rouge-L'] = rouge_score + return scores + + +def local_prf(pred_list, ref_list): + """ + Compute local precision recall and f1-score, + given only one prediction list and one reference list + """ + common = Counter(pred_list) & Counter(ref_list) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + p = 1.0 * num_same / len(pred_list) + r = 1.0 * num_same / len(ref_list) + f1 = (2 * p * r) / (p + r) + return p, r, f1 + + +def compute_prf(pred_dict, ref_dict): + """ + Compute precision recall and f1-score. 
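# Sketch (not part of the patch): computing the aggregate scores with the helpers above.
# Both dicts map question_id to a single-element answer list; for Chinese answers,
# normalize() is applied first so BLEU/ROUGE operate on space-joined characters.
from modelscope.models.nlp.palm_v2.dureader_eval import compute_bleu_rouge, normalize

pred = {'1': ['the cat sat on the mat']}
ref = {'1': ['the cat is sitting on the mat']}
scores = compute_bleu_rouge(pred, ref)
# -> {'Bleu-1': ..., 'Bleu-2': ..., 'Bleu-3': ..., 'Bleu-4': ..., 'Rouge-L': ...}

assert normalize(['参考答案']) == ['参 考 答 案']   # character-level normalization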
+ """ + # pred_question_ids = set(pred_dict.keys()) + ref_question_ids = set(ref_dict.keys()) + correct_preds, total_correct, total_preds = 0, 0, 0 + for question_id in ref_question_ids: + pred_entity_list = pred_dict.get(question_id, [[]]) + assert len(pred_entity_list) == 1, \ + 'the number of entity list for question_id {} is not 1.'.format(question_id) + pred_entity_list = pred_entity_list[0] + all_ref_entity_lists = ref_dict[question_id] + best_local_f1 = 0 + best_ref_entity_list = None + for ref_entity_list in all_ref_entity_lists: + local_f1 = local_prf(pred_entity_list, ref_entity_list)[2] + if local_f1 > best_local_f1: + best_ref_entity_list = ref_entity_list + best_local_f1 = local_f1 + if best_ref_entity_list is None: + if len(all_ref_entity_lists) > 0: + best_ref_entity_list = sorted( + all_ref_entity_lists, key=lambda x: len(x))[0] + else: + best_ref_entity_list = [] + gold_entities = set(best_ref_entity_list) + pred_entities = set(pred_entity_list) + correct_preds += len(gold_entities & pred_entities) + total_preds += len(pred_entities) + total_correct += len(gold_entities) + p = float(correct_preds) / total_preds if correct_preds > 0 else 0 + r = float(correct_preds) / total_correct if correct_preds > 0 else 0 + f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 + return {'Precision': p, 'Recall': r, 'F1': f1} + + +def prepare_prf(pred_dict, ref_dict): + """ + Prepares data for calculation of prf scores. + """ + preds = {k: v['entity_answers'] for k, v in pred_dict.items()} + refs = {k: v['entity_answers'] for k, v in ref_dict.items()} + return preds, refs + + +def filter_dict(result_dict, key_tag): + """ + Filter a subset of the result_dict, where keys ends with 'key_tag'. + """ + filtered = {} + for k, v in result_dict.items(): + if k.endswith(key_tag): + filtered[k] = v + return filtered + + +def get_metrics(pred_result, ref_result, task, source): + """ + Computes metrics. + """ + metrics = {} + + ref_result_filtered = {} + pred_result_filtered = {} + if source == 'both': + ref_result_filtered = ref_result + pred_result_filtered = pred_result + else: + for question_id, info in ref_result.items(): + if info['source'] == source: + ref_result_filtered[question_id] = info + if question_id in pred_result: + pred_result_filtered[question_id] = pred_result[ + question_id] + + if task == 'main' or task == 'all' \ + or task == 'description': + pred_dict, ref_dict = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + metrics = compute_bleu_rouge(pred_dict, ref_dict) + elif task == 'yesno': + pred_dict, ref_dict = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + keys = ['Yes', 'No', 'Depends'] + preds = [filter_dict(pred_dict, k) for k in keys] + refs = [filter_dict(ref_dict, k) for k in keys] + + metrics = compute_bleu_rouge(pred_dict, ref_dict) + + for k, pred, ref in zip(keys, preds, refs): + m = compute_bleu_rouge(pred, ref) + k_metric = [(k + '|' + key, v) for key, v in m.items()] + metrics.update(k_metric) + + elif task == 'entity': + pred_dict, ref_dict = prepare_prf(pred_result_filtered, + ref_result_filtered) + pred_dict_bleu, ref_dict_bleu = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + metrics = compute_prf(pred_dict, ref_dict) + metrics.update(compute_bleu_rouge(pred_dict_bleu, ref_dict_bleu)) + else: + raise ValueError('Illegal task name: {}'.format(task)) + + return metrics + + +def prepare_bleu(pred_result, ref_result, task): + """ + Prepares data for calculation of bleu and rouge scores. 
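# Worked example (not part of the patch) for the entity precision/recall helpers above:
from modelscope.models.nlp.palm_v2.dureader_eval import compute_prf, local_prf

p, r, f1 = local_prf(['北京', '上海'], ['北京'])
# one shared entity -> p = 1/2, r = 1/1, f1 = 2 * 0.5 * 1.0 / 1.5 ≈ 0.667

metrics = compute_prf(
    pred_dict={'42': [['北京', '上海']]},            # exactly one predicted entity list per qid
    ref_dict={'42': [['北京'], ['北京', '广州']]})    # one or more reference entity lists per qid
# best-matching reference is ['北京'] -> {'Precision': 0.5, 'Recall': 1.0, 'F1': ~0.667}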
+ """ + pred_list, ref_list = [], [] + qids = ref_result.keys() + for qid in qids: + if task == 'main': + pred, ref = get_main_result(qid, pred_result, ref_result) + elif task == 'yesno': + pred, ref = get_yesno_result(qid, pred_result, ref_result) + elif task == 'all': + pred, ref = get_all_result(qid, pred_result, ref_result) + elif task == 'entity': + pred, ref = get_entity_result(qid, pred_result, ref_result) + elif task == 'description': + pred, ref = get_desc_result(qid, pred_result, ref_result) + else: + raise ValueError('Illegal task name: {}'.format(task)) + if pred and ref: + pred_list += pred + ref_list += ref + pred_dict = dict(pred_list) + ref_dict = dict(ref_list) + for qid, ans in ref_dict.items(): + ref_dict[qid] = normalize(ref_dict[qid]) + pred_dict[qid] = normalize(pred_dict.get(qid, [EMPTY])) + if not ans or ans == [EMPTY]: + del ref_dict[qid] + del pred_dict[qid] + + for k, v in pred_dict.items(): + assert len(v) == 1, \ + 'There should be only one predict answer. question_id: {}'.format(k) + return pred_dict, ref_dict + + +def get_main_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'main'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + ref_ans = ref_result[qid]['answers'] + if not ref_ans: + ref_ans = [EMPTY] + pred_ans = pred_result.get(qid, {}).get('answers', [])[:1] + if not pred_ans: + pred_ans = [EMPTY] + + return [(qid, pred_ans)], [(qid, ref_ans)] + + +def get_entity_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'entity'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] != 'ENTITY': + return None, None + return get_main_result(qid, pred_result, ref_result) + + +def get_desc_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'description'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] != 'DESCRIPTION': + return None, None + return get_main_result(qid, pred_result, ref_result) + + +def get_yesno_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'yesno'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. 
+ Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + + def _uniq(li, is_ref): + uniq_li = [] + left = [] + keys = set() + for k, v in li: + if k not in keys: + uniq_li.append((k, v)) + keys.add(k) + else: + left.append((k, v)) + + if is_ref: + dict_li = dict(uniq_li) + for k, v in left: + dict_li[k] += v + uniq_li = [(k, v) for k, v in dict_li.items()] + return uniq_li + + def _expand_result(uniq_li): + expanded = uniq_li[:] + keys = set([x[0] for x in uniq_li]) + for k in YESNO_LABELS - keys: + expanded.append((k, [EMPTY])) + return expanded + + def _get_yesno_ans(qid, result_dict, is_ref=False): + if qid not in result_dict: + return [(str(qid) + '_' + k, v) for k, v in _expand_result([])] + yesno_answers = result_dict[qid]['yesno_answers'] + answers = result_dict[qid]['answers'] + lbl_ans = _uniq([(k, [v]) for k, v in zip(yesno_answers, answers)], + is_ref) + ret = [(str(qid) + '_' + k, v) for k, v in _expand_result(lbl_ans)] + return ret + + if ref_result[qid]['question_type'] != 'YES_NO': + return None, None + + ref_ans = _get_yesno_ans(qid, ref_result, is_ref=True) + pred_ans = _get_yesno_ans(qid, pred_result) + return pred_ans, ref_ans + + +def get_all_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'all'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] == 'YES_NO': + return get_yesno_result(qid, pred_result, ref_result) + return get_main_result(qid, pred_result, ref_result) + + +def format_metrics(metrics, task, err_msg): + """ + Format metrics. 'err' field returns any error occured during evaluation. + + Args: + metrics: A dict object contains metrics for different tasks. + task: Task name. + err_msg: Exception raised during evaluation. + Returns: + Formatted result. 
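# Sketch (not part of the patch): the evaluation entry point defined at the bottom of this
# file can also be driven programmatically. The file paths are hypothetical; both files are
# JSON-lines with the fields checked by data_check()/read_file() above.
import argparse

from modelscope.models.nlp.palm_v2.dureader_eval import main as dureader_eval_main

args = argparse.Namespace(pred_file='pred.json', ref_file='ref.json', task='main')
dureader_eval_main(args)   # prints the JSON summary assembled by format_metrics()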
+ """ + result = {} + sources = ['both', 'search', 'zhidao'] + if err_msg is not None: + return {'errorMsg': str(err_msg), 'errorCode': 1, 'data': []} + data = [] + if task != 'all' and task != 'main': + sources = ['both'] + + if task == 'entity': + metric_names = ['Bleu-4', 'Rouge-L'] + metric_names_prf = ['F1', 'Precision', 'Recall'] + for name in metric_names + metric_names_prf: + for src in sources: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': src, + } + data.append(obj) + elif task == 'yesno': + metric_names = ['Bleu-4', 'Rouge-L'] + details = ['Yes', 'No', 'Depends'] + src = sources[0] + for name in metric_names: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': 'All', + } + data.append(obj) + for d in details: + obj = { + 'name': name, + 'value': round(metrics[src].get(d + '|' + name, 0) * 100, + 2), + 'type': d + } + data.append(obj) + else: + metric_names = ['Bleu-4', 'Rouge-L'] + for name in metric_names: + for src in sources: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': src + } + data.append(obj) + + result['data'] = data + result['errorCode'] = 0 + result['errorMsg'] = 'success' + + return result + + +def main(args): + """ + Do evaluation. + """ + err = None + metrics = {} + try: + pred_result = read_file(args.pred_file, args.task) + ref_result = read_file(args.ref_file, args.task, is_ref=True) + sources = ['both', 'search', 'zhidao'] + if args.task not in set(['main', 'all']): + sources = sources[:1] + for source in sources: + metrics[source] = get_metrics(pred_result, ref_result, args.task, + source) + except ValueError as ve: + err = ve + except AssertionError as ae: + err = ae + + print( + json.dumps( + format_metrics(metrics, args.task, err), + ensure_ascii=False).encode('utf8')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('pred_file', help='predict file') + parser.add_argument('ref_file', help='reference file') + parser.add_argument( + 'task', help='task name: Main|Yes_No|All|Entity|Description') + + args = parser.parse_args() + args.task = args.task.lower().replace('_', '') + main(args) diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py new file mode 100644 index 00000000..c2121cfd --- /dev/null +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -0,0 +1,1332 @@ +import codecs +import copy +import math +import os +import subprocess +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import json +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import xavier_uniform_ +from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, + RobertaModel, RobertaTokenizer) +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel + +from modelscope.outputs import OutputKeys +from modelscope.utils import logger as logging +from .configuration_palm import PalmConfig +from .dureader_eval import compute_bleu_rouge, normalize + +CONFIG_NAME = 'config.json' +WEIGHTS_NAME = 'pytorch_model.bin' + + +class MultiHeadedAttention(nn.Module): # SelfAttention + """ + Multi-Head Attention module from + "Attention is All You Need" + :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. + + Similar to standard `dot` attention but uses + multiple attention distributions simulataneously + to select relevant items. + + .. 
mermaid:: + + graph BT + A[key] + B[value] + C[query] + O[output] + subgraph Attn + D[Attn 1] + E[Attn 2] + F[Attn N] + end + A --> D + C --> D + A --> E + C --> E + A --> F + C --> F + D --> O + E --> O + F --> O + B --> O + + Also includes several additional tricks. + + Args: + head_count (int): number of parallel heads + model_dim (int): the dimension of keys/values/queries, + must be divisible by head_count + dropout (float): dropout parameter + """ + + def __init__(self, + head_count, + model_dim, + dropout=0.1, + use_final_linear=True): + assert model_dim % head_count == 0 + self.dim_per_head = model_dim // head_count + self.model_dim = model_dim + + super().__init__() + self.head_count = head_count + + self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_values = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.linear_query = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(dropout) + self.use_final_linear = use_final_linear + if (self.use_final_linear): + self.final_linear = nn.Linear(model_dim, model_dim) + + def forward(self, + key, + value, + query, + mask=None, + layer_cache=None, + type=None, + predefined_graph_1=None, + return_attn=False): + """ + Compute the context vector and the attention vectors. + + Args: + key (`FloatTensor`): set of `key_len` + key vectors `[batch, key_len, dim]` + value (`FloatTensor`): set of `key_len` + value vectors `[batch, key_len, dim]` + query (`FloatTensor`): set of `query_len` + query vectors `[batch, query_len, dim]` + mask: binary mask indicating which keys have + non-zero attention `[batch, query_len, key_len]` + Returns: + (`FloatTensor`, `FloatTensor`) : + + * output context vectors `[batch, query_len, dim]` + * one of the attention vectors `[batch, query_len, key_len]` + """ + + batch_size = key.size(0) + dim_per_head = self.dim_per_head + head_count = self.head_count + + def shape(x): + """ projection """ + return x.view(batch_size, -1, head_count, dim_per_head) \ + .transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous() \ + .view(batch_size, -1, head_count * dim_per_head) + + # 1) Project key, value, and query. + if layer_cache is not None: + if type == 'self': + query, key, value = self.linear_query(query), self.linear_keys( + query), self.linear_values(query) + + key = shape(key) + value = shape(value) + + if layer_cache is not None: + device = key.device + if layer_cache['self_keys'] is not None: + key = torch.cat( + (layer_cache['self_keys'].to(device), key), dim=2) + if layer_cache['self_values'] is not None: + value = torch.cat( + (layer_cache['self_values'].to(device), value), + dim=2) + layer_cache['self_keys'] = key + layer_cache['self_values'] = value + elif type == 'context': + query = self.linear_query(query) + if layer_cache is not None: + if layer_cache['memory_keys'] is None: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key, value = layer_cache['memory_keys'], layer_cache[ + 'memory_values'] + layer_cache['memory_keys'] = key + layer_cache['memory_values'] = value + else: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key = self.linear_keys(key) + value = self.linear_values(value) + query = self.linear_query(query) + key = shape(key) + value = shape(value) + + query = shape(query) + + # 2) Calculate and scale scores. 
+ query = query / math.sqrt(dim_per_head) + scores = torch.matmul(query, key.transpose(2, 3)) + + if mask is not None: + mask = mask.unsqueeze(1).expand_as(scores) + scores = scores.masked_fill(mask, -1e18) + + # 3) Apply attention dropout and compute context vectors. + + attn = self.softmax(scores) + + if predefined_graph_1 is not None: + attn_masked = attn[:, -1] * predefined_graph_1 + attn_masked = attn_masked / ( + torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) + + attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) + + drop_attn = self.dropout(attn) + if self.use_final_linear: + context = unshape(torch.matmul(drop_attn, value)) + output = self.final_linear(context) + if return_attn: + return output, attn + else: + return output + else: + context = torch.matmul(drop_attn, value) + if return_attn: + return context, attn + else: + return context + + +class PositionwiseFeedForward(nn.Module): # Output + """ A two-layer Feed-Forward-Network with residual layer norm. + + Args: + d_model (int): the size of input for the first-layer of the FFN. + d_ff (int): the hidden layer size of the second-layer + of the FNN. + dropout (float): dropout probability in :math:`[0, 1)`. + """ + + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.w_1 = nn.Linear(d_model, d_ff) + self.actv = ACT2FN['gelu_new'] + self.dropout_1 = nn.Dropout(dropout) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x): + inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) + output = self.dropout_2(self.w_2(inter)) + return output + x + + +class TransformerDecoderLayer(nn.Module): # Layer + """ + Args: + d_model (int): the dimension of keys/values/queries in + MultiHeadedAttention, also the input size of + the first-layer of the PositionwiseFeedForward. + heads (int): the number of heads for MultiHeadedAttention. + d_ff (int): the second-layer of the PositionwiseFeedForward. + dropout (float): dropout probability(0-1.0). + self_attn_type (string): type of self-attention scaled-dot, average + """ + MAX_SIZE = 5000 + + def __init__(self, d_model, heads, d_ff, dropout): + super().__init__() + + self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) + + self.context_attn = MultiHeadedAttention( + heads, d_model, dropout=dropout) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) + self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) + self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) + self.drop = nn.Dropout(dropout) + mask = self._get_attn_subsequent_mask(self.MAX_SIZE) + # Register self.mask as a buffer in TransformerDecoderLayer, so + # it gets TransformerDecoderLayer's cuda behavior automatically. 
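# Shape sketch (not part of the patch) for the MultiHeadedAttention module defined above,
# used here as plain self-attention with no mask and no decoding cache.
import torch

from modelscope.models.nlp.palm_v2.modeling_palm import MultiHeadedAttention

attn = MultiHeadedAttention(head_count=8, model_dim=512, dropout=0.1)
x = torch.rand(2, 7, 512)                                # [batch, seq_len, model_dim]
context = attn(x, x, x)                                  # -> [2, 7, 512]
context, weights = attn(x, x, x, return_attn=True)       # weights: [2, 8, 7, 7]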
+ self.register_buffer('mask', mask) + + def forward(self, + inputs, + memory_bank, + src_pad_mask, + tgt_pad_mask, + previous_input=None, + layer_cache=None, + step=None): + """ + Args: + inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` + memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` + src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` + tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` + + Returns: + (`FloatTensor`, `FloatTensor`, `FloatTensor`): + + * output `[batch_size x 1 x model_dim]` + * attn `[batch_size x 1 x src_len]` + * all_input `[batch_size x current_step x model_dim]` + + """ + dec_mask = torch.gt( + tgt_pad_mask.type(torch.uint8) + + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( + torch.uint8), 0) + input_norm = self.layer_norm_1(inputs) + all_input = input_norm + if previous_input is not None: + all_input = torch.cat((previous_input, input_norm), dim=1) + dec_mask = None + + query = self.self_attn( + all_input, + all_input, + input_norm, + mask=dec_mask, + layer_cache=layer_cache, + type='self') + + query = self.drop(query) + inputs + + query_norm = self.layer_norm_2(query) + mid, attn = self.context_attn( + memory_bank, + memory_bank, + query_norm, + mask=src_pad_mask, + layer_cache=layer_cache, + type='context', + return_attn=True) + output = self.feed_forward(self.drop(mid) + query) + + return output, attn, all_input + + def _get_attn_subsequent_mask(self, size): + """ + Get an attention mask to avoid using the subsequent info. + + Args: + size: int + + Returns: + (`LongTensor`): + + * subsequent_mask `[1 x size x size]` + """ + attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') + subsequent_mask = torch.from_numpy(subsequent_mask) + return subsequent_mask + + +class PositionalEncoding(nn.Module): + + def __init__(self, dropout, dim, max_len=5000): + super().__init__() + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) + * -(math.log(10000.0) / dim))) + pe[:, 0::2] = torch.sin(position.float() * div_term) + pe[:, 1::2] = torch.cos(position.float() * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + self.dropout = nn.Dropout(dropout) + self.dim = dim + + def forward(self, emb, step=None): + emb = emb * math.sqrt(self.dim) + if (step): + emb = emb + self.pe[:, step][:, None, :] + + else: + emb = emb + self.pe[:, :emb.size(1)] + emb = self.dropout(emb) + return emb + + def get_emb(self, emb): + return self.pe[:, :emb.size(1)] + + +class TransformerDecoder(nn.Module): # Decoder + """ + The Transformer decoder from "Attention is All You Need". + + + .. mermaid:: + + graph BT + A[input] + B[multi-head self-attn] + BB[multi-head src-attn] + C[feed forward] + O[output] + A --> B + B --> BB + BB --> C + C --> O + + + Args: + num_layers (int): number of encoder layers. 
+ d_model (int): size of the model + heads (int): number of heads + d_ff (int): size of the inner FF layer + dropout (float): dropout parameters + embeddings (:obj:`onmt.modules.Embeddings`): + embeddings to use, should have positional encodings + attn_type (str): if using a seperate copy attention + """ + decoder_type = 'transformer' + + class TransformerDecoderState: + + def __init__(self, src): + self.src = src + self.previous_input = None + self.previous_layer_inputs = None + self.cache = None + + def update_state(self, new_input, previous_layer_inputs): + self.previous_input = new_input + self.previous_layer_inputs = previous_layer_inputs + self.cache = None + + def _init_cache(self, num_layers): + self.cache = {} + for num in range(num_layers): + layer_cache = { + 'memory_keys': None, + 'memory_values': None, + 'self_keys': None, + 'self_values': None + } + self.cache['layer_{}'.format(num)] = layer_cache + + def map_batch_fn(self, fn): + + def _recursive_map(struct, batch_dim=0): + for k, v in struct.items(): + if v is not None: + if isinstance(v, dict): + _recursive_map(v) + else: + struct[k] = fn(v, batch_dim) + + self.src = fn(self.src, 0) + if self.cache is not None: + _recursive_map(self.cache) + + def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): + super().__init__() + + # Basic attributes. + self.num_layers = num_layers + self.embeddings = embeddings + self.pos_emb = PositionalEncoding(dropout, + self.embeddings.embedding_dim) + + # Build TransformerDecoder. + self.transformer_layers = nn.ModuleList([ + TransformerDecoderLayer(d_model, heads, d_ff, dropout) + for _ in range(num_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.state = None + + def init_state(self, src, with_cache=False): + self.state = self.TransformerDecoderState(src) + if with_cache: + self.state._init_cache(self.num_layers) + + def forward(self, tgt, memory_bank, step=None, memory_masks=None): + src_words = self.state.src + tgt_words = tgt + src_batch, src_len = src_words.size() + tgt_batch, tgt_len = tgt_words.size() + + # Run the forward pass of the TransformerDecoder. + # emb = self.embeddings(tgt, step=step) + emb = self.embeddings(tgt) + assert emb.dim() == 3 # len x batch x embedding_dim + output = self.pos_emb(emb, step) + + src_memory_bank = memory_bank + padding_idx = self.embeddings.padding_idx + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(tgt_batch, tgt_len, tgt_len) + + if memory_masks is not None: + src_len = memory_masks.size(-1) + src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) + else: + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(src_batch, tgt_len, src_len) + + if self.state.cache is None: + saved_inputs = [] + attns = [] + for i in range(self.num_layers): + prev_layer_input = None + if self.state.cache is None: + if self.state.previous_input is not None: + prev_layer_input = self.state.previous_layer_inputs[i] + output, attn, all_input \ + = self.transformer_layers[i](output, src_memory_bank, src_pad_mask, tgt_pad_mask, + previous_input=prev_layer_input, + layer_cache=self.state.cache['layer_{}'.format(i)] + if self.state.cache is not None else None, step=step) + if self.state.cache is None: + saved_inputs.append(all_input) + attns.append(attn) + + if self.state.cache is None: + saved_inputs = torch.stack(saved_inputs) + + output = self.layer_norm(output) + + # Process the result and update the attentions. 
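# Shape sketch (not part of the patch) for the TransformerDecoder above: it consumes target
# token ids plus an encoder memory bank, and init_state() must be called with the source ids
# (used only to build the source padding mask) before the forward pass.
import torch
from torch import nn

from modelscope.models.nlp.palm_v2.modeling_palm import TransformerDecoder

emb = nn.Embedding(100, 64, padding_idx=0)
decoder = TransformerDecoder(
    num_layers=2, d_model=64, heads=4, d_ff=256, dropout=0.1, embeddings=emb)

src = torch.randint(1, 100, (2, 9))          # source token ids, [batch, src_len]
memory_bank = torch.rand(2, 9, 64)           # encoder output, [batch, src_len, d_model]
tgt = torch.randint(1, 100, (2, 5))          # target token ids, [batch, tgt_len]

decoder.init_state(src)
output, attns = decoder(tgt, memory_bank)    # output: [2, 5, 64]; attns: one tensor per layer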
+ if self.state.cache is None: + self.state.update_state(tgt, saved_inputs) + + return output, attns + + +class PalmPointerGenerator(nn.Module): + + def __init__(self, hidden_size, vocab_size): + super().__init__() + self.dense = nn.Linear(hidden_size, vocab_size) + self.gen_func = nn.LogSoftmax(-1) + + def forward(self, x): + x = self.dense(x) + x = self.gen_func(x) + return x + + +class PalmPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PalmConfig + base_model_prefix = 'palm' + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Optional[Union[str, + os.PathLike]], + **kwargs): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + config = PalmConfig.from_json_file(config_file) if os.path.isfile( + config_file) else PalmConfig() + config.encoder_pth = os.path.join(pretrained_model_name_or_path, + config.encoder_pth) + checkpoint_file = os.path.join(pretrained_model_name_or_path, + WEIGHTS_NAME) + checkpoint = torch.load(checkpoint_file) if os.path.isfile( + checkpoint_file) else None + return cls(config, checkpoint, **kwargs) + + +class AbsSummarizer(PalmPreTrainedModel): # Model + + def __init__(self, config, checkpoint=None): + super().__init__(config) + self.config = config + if config.encoder == 'bert' or config.encoder == 'zh_bert': + self.bert = BertModel( + BertConfig.from_pretrained(config.encoder_pth)) + elif config.encoder == 'roberta': + self.bert = RobertaModel( + RobertaConfig.from_pretrained(config.encoder_pth)) + + if (config.max_pos > 512): + my_pos_embeddings = nn.Embedding( + config.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[:512] = \ + self.bert.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[512:] = \ + self.bert.embeddings.position_embeddings.weight.data[-1][None, :].repeat(config.max_pos - 512, 1) + self.bert.model.embeddings.position_embeddings = my_pos_embeddings + self.vocab_size = self.bert.config.vocab_size + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1 if config.encoder == 'roberta' else 0) + + if config.share_emb: + tgt_embeddings.weight = copy.deepcopy( + self.bert.model.embeddings.word_embeddings.weight) + self.decoder = TransformerDecoder( + config.dec_layers, + config.dec_hidden_size, + heads=config.dec_heads, + d_ff=config.dec_ff_size, + dropout=config.dec_dropout, + embeddings=tgt_embeddings) + self.generator = PalmPointerGenerator(config.dec_hidden_size, + self.vocab_size) + self.generator.dense.weight = self.decoder.embeddings.weight + + if checkpoint is not None: + for key in list(checkpoint['model'].keys()): + checkpoint['model'][key.replace('module.', + '')] = checkpoint['model'][key] + msg = self.load_state_dict(checkpoint['model'], strict=False) + print(msg) + else: + for module in self.decoder.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + for p in self.generator.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + p.data.zero_() + if config.use_bert_emb: + if config.encoder == 'roberta': + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + 
padding_idx=1) + else: + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=0) + tgt_embeddings.weight = copy.deepcopy( + self.bert.embeddings.word_embeddings.weight) + self.decoder.embeddings = tgt_embeddings + self.generator.dense.weight = self.decoder.embeddings.weight + + def forward(self, src, tgt, mask_src): + top_vec, _ = self.bert(src, mask_src, return_dict=False) + self.decoder.init_state(src) + decoder_outputs, attns = self.decoder(tgt[:, :-1], top_vec) + return decoder_outputs, attns[-1], top_vec + + +class LabelSmoothingLoss(nn.Module): + """ + With label smoothing, + KL-divergence between q_{smoothed ground truth prob.}(w) + and p_{prob. computed by model}(w) is minimized. + """ + + def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): + assert 0.0 < label_smoothing <= 1.0 + self.padding_idx = ignore_index + super(LabelSmoothingLoss, self).__init__() + + smoothing_value = label_smoothing / (tgt_vocab_size - 2) + one_hot = torch.full((tgt_vocab_size, ), smoothing_value) + one_hot[self.padding_idx] = 0 + self.register_buffer('one_hot', one_hot.unsqueeze(0)) + self.confidence = 1.0 - label_smoothing + + def forward(self, output, target): + """ + output (FloatTensor): batch_size x n_classes + target (LongTensor): batch_size + """ + model_prob = self.one_hot.repeat(target.size(0), 1) + model_prob.scatter_(1, target.unsqueeze(1), self.confidence) + model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) + + return F.kl_div(output, model_prob, reduction='sum') + + +class NMTLossCompute(nn.Module): + """ + Standard NMT Loss Computation. + """ + + def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): + super().__init__() + self.generator = generator + self.padding_idx = symbols['PAD'] + if label_smoothing > 0: + self.criterion = LabelSmoothingLoss( + label_smoothing, vocab_size, ignore_index=self.padding_idx) + else: + self.criterion = nn.NLLLoss( + ignore_index=self.padding_idx, reduction='sum') + + def _bottle(self, _v): + return _v.view(-1, _v.size(2)) + + def _unbottle(self, _v, batch_size): + return _v.view(-1, batch_size, _v.size(1)) + + def forward(self, tgt, output): + target = tgt[:, 1:] + normalization = target.ne(self.padding_idx).sum() + bottled_output = self._bottle(output) + scores = self.generator(bottled_output) + gtruth = target.contiguous().view(-1) + loss = self.criterion(scores, gtruth) + loss.div(float(normalization)) + return loss + + +class PalmForConditionalGeneration(PalmPreTrainedModel): + + def __init__(self, config, checkpoint=None): + super().__init__(config) + self.config = config + if config.encoder == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=False) + symbols = { + 'BOS': tokenizer.cls_token_id, + 'EOS': tokenizer.sep_token_id, + 'PAD': tokenizer.pad_token_id, + 'EOQ': tokenizer.unk_token_id + } + elif config.encoder == 'bert' or config.encoder == 'zh_bert': + tokenizer = BertTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=True) + symbols = { + 'BOS': tokenizer.vocab['[CLS]'], + 'EOS': tokenizer.vocab['[SEP]'], + 'PAD': tokenizer.vocab['[PAD]'], + 'EOQ': tokenizer.vocab['[unused2]'] + } + self.tokenizer = tokenizer + self.symbols = symbols + self.palm = AbsSummarizer(config, checkpoint) + self.loss = NMTLossCompute(self.palm.generator, symbols, + self.palm.vocab_size, + config.label_smoothing) + + def forward(self, src, tgt, mask_src): + output = self.palm(src, tgt, mask_src)[0] + loss = 
self.loss(tgt, output) + return loss + + +class Translator(nn.Module): + """ + Uses a model to translate a batch of sentences. + """ + + @dataclass + class Batch: + batch_size: int + src: torch.Tensor + tgt: torch.Tensor + mask_src: torch.Tensor + query_id: List[None] = None + src_str: List[List[str]] = None + tgt_str: List[str] = None + + def __init__(self, + model: PalmForConditionalGeneration, + dataset: str = 'cnn'): + super().__init__() + self.logger = logging.get_logger(__name__) + self.args = model.config + self.args.dataset = dataset + self.model = model.palm + self.generator = self.model.generator + self.vocab = model.tokenizer + self.symbols = model.symbols + self.start_token = self.symbols['BOS'] + self.end_token = self.symbols['EOS'] + self.alpha = self.args.alpha + self.beam_size = self.args.beam_size + self.min_length = self.args.min_length + self.max_length = self.args.max_length + + def from_batch(self, translation_batch): + batch = translation_batch['batch'] + assert (len(translation_batch['gold_score']) == len( + translation_batch['predictions'])) + batch_size = batch.batch_size + + preds, pred_score, _, tgt_str, src, src_str = \ + translation_batch['predictions'], translation_batch['scores'], translation_batch['gold_score'], \ + batch.tgt_str, batch.src, batch.src_str + query_id = batch.query_id + ''' + try: + query_id = batch.query_id + except: + query_id = None + ''' + translations = [] + for b in range(batch_size): + if self.args.dataset == 'qg_ranking_test': + if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': + pred_sents = [ + ' '.join( + self.vocab.convert_ids_to_tokens( + [int(n) for n in each])).replace(' ##', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = [ + self.vocab.decode([int(n) for n in each + ]).replace('', + '').replace('', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = self.vocab.decode([int(n) + for n in preds[b][0]]).replace( + '', + '').replace('', '') + elif self.args.encoder == 'bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ' '.join(pred_sents).replace(' ##', '') + elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': + pred_sents = [ + self.vocab.convert_ids_to_tokens([int(n) for n in pred]) + for pred in preds[b] + ] + pred_sents = [ + ''.join(pred).replace(' ##', '') for pred in pred_sents + ] + elif self.args.encoder == 'zh_bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ''.join(pred_sents).replace('##', '') + gold_sent = tgt_str[b] + + if self.args.encoder == 'roberta': + raw_src = self.vocab.decode([int(t) for t in src[b]]) + raw_src = ' '.join(src_str[b]) + else: + raw_src = [self.vocab.ids_to_tokens[int(t)] + for t in src[b]][:500] + raw_src = ' '.join(raw_src) + if self.args.dataset == 'faq': + translation = (pred_sents, gold_sent, src_str[b], query_id[b], + pred_score[b]) + else: + translation = (pred_sents, gold_sent, raw_src, query_id[b], + pred_score[b]) + # translation = (pred_sents[0], gold_sent) + translations.append(translation) + + return translations + + def translate(self, data_iter, step): + gold_path = self.args.result_path + '.%d.gold' % step + can_path = self.args.result_path + '.%d.candidate' % step + self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') + self.can_out_file = codecs.open(can_path, 'w', 'utf-8') + self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', + 'utf-8') + 
if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': + out = '\t'.join([ + 'query_id', 'source_query', 'target_query', 'predict_query' + ]) + '\n' + self.pred_json_score_out_file.write(out) + + raw_src_path = self.args.result_path + '.%d.raw_src' % step + self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') + + pred_results, gold_results = [], [] + cnt = 0 + pred_dict, ref_dict = {}, {} + for i, batch in enumerate(data_iter): + self.logger.info(f'data: {i + 1} / {len(data_iter)}') + batch_data = self.translate_batch(batch) + translations = self.from_batch(batch_data) + + for trans in translations: + pred, gold, src, query_id, pred_score = trans + src = src.replace('', '').replace('##', '').strip() + if self.args.dataset == 'qg_ranking_test': + pred_str = '\t'.join([ + each.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', + '').replace(r' +', ' ').replace( + '', + '').replace('', '').replace( + '', + '').replace('', '').replace( + '', ' ').strip() + for each in pred + ]) + else: + pred_str = pred.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', '').replace('[CLS]', '').replace( + '[SEP]', '').replace('[UNK]', '').strip() + pred_str = pred_str.replace(r' +', ' ').replace( + '', + '').replace('', '').replace('', '').replace( + '', '').replace('', ' ').strip() + gold_str = gold.replace('', '').strip().replace( + '[UNK]', '').replace('[unused1]', '').replace( + '[unused2]', + '').replace('##', '').replace('[CLS]', '').replace( + '[SEP]', '').strip().replace('', '').replace( + '', '').replace('', ' ').strip() + if (self.args.recall_eval): + _pred_str = '' + # gap = 1e3 + for sent in pred_str.split(''): + can_pred_str = _pred_str + '' + sent.strip() + # can_gap = math.fabs(len(_pred_str.split()) - len(gold_str.split())) + # if(can_gap>=gap): + if len(can_pred_str.split()) >= len( + gold_str.split()) + 10: + pred_str = _pred_str + break + else: + # gap = can_gap + _pred_str = can_pred_str + + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + pred_str = pred_str.replace('', ' ') + if query_id is not None: + pred_json = { + 'query_id': query_id, + 'answers': [pred_str] + } + gold_json = { + 'query_id': query_id, + 'answers': [gold_str] + } + pred_json_score = { + 'query_id': query_id, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + else: + pred_json = {'query_id': cnt, 'answers': [pred_str]} + gold_json = {'query_id': cnt, 'answers': [gold_str]} + pred_json_score = { + 'query_id': cnt, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + json.dump(pred_json, self.can_out_file) + self.can_out_file.write('\n') + json.dump(gold_json, self.gold_out_file) + self.gold_out_file.write('\n') + json.dump(pred_json_score, self.pred_json_score_out_file) + self.pred_json_score_out_file.write('\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'cnn': + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write(gold_str + '\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'dureader': + if query_id is None: + query_id = str(cnt) + pred_results.extend(normalize([pred_str])) + gold_results.extend(normalize([gold_str])) + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write('\t'.join([src[0], gold_str]) + + '\n') + + 
elif self.args.dataset == 'paraphrase': + if query_id is None: + query_id = str(cnt) + if self.args.encoder == 'roberta': + pred_str = [pred_str] + pred_dict[query_id] = normalize([pred_str[0]]) + ref_dict[query_id] = normalize([gold_str]) + # pred_str_list = [src] + pred_str + # self.can_out_file.write("\t".join(pred_str_list)+"\n") + # self.can_out_file.write("\t".join(pred_str_list)+"\n") + # self.gold_out_file.write("\t".join([src, pred_str[0], gold_str])+"\n") + self.pred_json_score_out_file.write( + '\t'.join([str(query_id), src, gold_str, pred_str[0]]) + + '\n') + elif self.args.dataset == 'faq': + if pred_score[0].cpu().numpy().tolist() < -3.5: + continue + self.can_out_file.write( + '\t'.join([str(query_id), src, pred_str]) + '\n') + self.gold_out_file.write( + '\t'.join([str(query_id), src, gold_str]) + '\n') + # passage, answer, question, score + self.pred_json_score_out_file.write('\t'.join([ + str(query_id), gold_str, src, pred_str, + str(pred_score[0].cpu().numpy().tolist()) + ]) + '\n') + elif self.args.dataset == 'qg_ranking_test': + self.can_out_file.write( + str(query_id) + '\t' + pred_str + '\n') + + cnt += 1 + self.can_out_file.flush() + self.gold_out_file.flush() + self.src_out_file.flush() + self.logger.info('cnt: %s' % cnt) + self.can_out_file.close() + self.gold_out_file.close() + self.src_out_file.close() + + if (step != -1): + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + cnn_results = subprocess.getoutput( + './run.sh %s %s' % (gold_path, can_path)) # run.sh ... + self.logger.info(cnn_results) + elif self.args.dataset == 'cnn': + self.logger.info('Calculating Rouge') + from rouge import Rouge + candidates = [ + line.strip() for line in open(can_path, encoding='utf-8') + ] + references = [ + line.strip() for line in open(gold_path, encoding='utf-8') + ] + rouge_score = Rouge().get_scores( + candidates, references, avg=True) + # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) + print(rouge_score) + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + + def postprocess_text(preds, labels): + preds = [pred.strip().replace('.', '') for pred in preds] + labels = [label.strip() for label in labels] + while '' in preds: + idx = preds.index('') + preds[idx] = '。' + return preds, labels + + # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) + # self.logger.info('Dev eval result: {}'.format(bleu_rouge)) + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} + gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} + bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) + print(bleu_rouge) + # unreachable + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) + # self.logger.info('Dev eval result: {}'.format(bleu_rouge)) + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + bleu_score = cal_bleu(pred_results, gold_results) + from rouge import Rouge + rouge = Rouge() + rouge_score = rouge.get_scores( + pred_results, gold_results, avg=True) + print("'Dev eval result: Bleu-4={}, {}".format( + bleu_score, rouge_score)) + + def translate_batch(self, batch: 'Batch', fast: bool = False): + """ + Translate a batch of sentences. + + Mostly a wrapper around :obj:`Beam`. 
+ + Args: + batch (:obj:`Batch`): a batch from a dataset object + data (:obj:`Dataset`): the dataset object + fast (bool): enables fast beam search (may not support all features) + + Todo: + Shouldn't need the original dataset. + """ + self.model.eval() + with torch.no_grad(): + return self._fast_translate_batch( + batch, self.max_length, min_length=self.min_length) + + def _tile(self, x, count, dim=0): + perm = list(range(len(x.size()))) + if dim != 0: + perm[0], perm[dim] = perm[dim], perm[0] + x = x.permute(perm).contiguous() + out_size = list(x.size()) + out_size[0] *= count + batch = x.size(0) + x = x.view(batch, -1) \ + .transpose(0, 1) \ + .repeat(count, 1) \ + .transpose(0, 1) \ + .contiguous() \ + .view(*out_size) + if dim != 0: + x = x.permute(perm).contiguous() + return x + + def _top_k_top_p_filtering(self, + logits, + top_k=10, + top_p=1.0, + filter_value=-float('Inf'), + min_tokens_to_keep=1): + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), + logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + def _fast_translate_batch(self, + batch: 'Batch', + max_length: int, + min_length: int = 0): + # TODO: faster code path for beam_size == 1. + # TODO: support these blacklisted features. + + beam_size = self.beam_size + batch_size = batch.batch_size + src = batch.src + mask_src = batch.mask_src + + src_features, _ = self.model.bert(src, mask_src, return_dict=False) + self.model.decoder.init_state(src, with_cache=True) + device = src_features.device + + # Tile states and memory beam_size times. + self.model.decoder.state.map_batch_fn( + lambda state, dim: self._tile(state, beam_size, dim=dim)) + src_features = self._tile(src_features, beam_size, dim=0) + batch_offset = torch.arange( + batch_size, dtype=torch.long, device=device) + beam_offset = torch.arange( + 0, + batch_size * beam_size, + step=beam_size, + dtype=torch.long, + device=device) + alive_seq = torch.full([batch_size * beam_size, 1], + self.start_token, + dtype=torch.long, + device=device) + + # Give full probability to the first beam on the first step. + topk_log_probs = ( + torch.tensor( + [0.0] + [float('-inf')] * (beam_size - 1), + device=device).repeat(batch_size)) + + # Structure that holds finished hypotheses. 
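Editor's note: `_top_k_top_p_filtering` above applies the standard top-k / nucleus (top-p) trick, pushing logits outside the kept set to `-inf` before sampling. A small self-contained sketch of the top-k part on toy logits (values are made up):

```python
import torch
import torch.nn.functional as F

# Toy logits for a 5-token vocabulary; numbers are invented.
logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])
top_k = 3

# Keep only the top_k largest logits, mirroring the indices_to_remove logic above.
kth_best = torch.topk(logits, top_k)[0][..., -1, None]
filtered = logits.masked_fill(logits < kth_best, float('-inf'))

probs = F.softmax(filtered, dim=-1)              # probability mass only on the 3 best tokens
next_token = torch.multinomial(probs, num_samples=1)
```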
+ hypotheses = [[] for _ in range(batch_size)] # noqa: F812 + + results = {} + results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 + results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 + results['gold_score'] = [0] * batch_size + results['batch'] = batch + + for step in range(max_length): + self.logger.info(f'step: {step + 1} / {max_length}') + decoder_input = alive_seq[:, -1].view(1, -1) + + # Decoder forward. + decoder_input = decoder_input.transpose(0, 1) + dec_out, attns = self.model.decoder( + decoder_input, src_features, step=step) + + # Generator forward. + log_probs = self.generator.forward( + dec_out.transpose(0, 1).squeeze(0)) + vocab_size = log_probs.size(-1) + + if step < min_length: + log_probs[:, self.end_token] = -1e20 + + # Multiply probs by the beam probability. + + length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha + # ''' + if self.args.sample_topk: + temperature = self.args.temperature + _scores = log_probs / temperature + _scores = self._top_k_top_p_filtering( + _scores, + top_k=self.args.top_k, + top_p=self.args.top_p, + min_tokens_to_keep=1 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next words for each beam (so we have some spare tokens + # and match output of greedy beam search) + topk_ids = torch.multinomial( + F.softmax(_scores, dim=-1), + num_samples=1) # (batch_size * num_beams, 2) + # Compute next scores + _scores = F.log_softmax( + _scores, dim=1) # (batch_size * num_beams, vocab_size) + + _scores += topk_log_probs.view(-1).unsqueeze(1) + _scores = _scores / length_penalty + topk_scores = torch.gather( + _scores, -1, topk_ids) # (batch_size * num_beams, 2) + # log_probs += # (batch_size * num_beams, 2) + # Match shape of greedy beam search + topk_ids = topk_ids.view( + -1, beam_size) # (batch_size, 2 * num_beams) + topk_scores = topk_scores.view( + -1, beam_size) # (batch_size, 2 * num_beams) + # ''' + else: + log_probs += topk_log_probs.view(-1).unsqueeze(1) + curr_scores = log_probs / length_penalty + + curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) + topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) + if self.args.block_trigram: + cur_len = alive_seq.size(1) + if cur_len > 3: + for i in range(alive_seq.size(0)): + fail = False + words = [int(w) for w in alive_seq[i]] + if self.args.encoder == 'roberta': + # words = [self.vocab.convert_ids_to_tokens[w] for w in words] + words = self.vocab.decode(words).strip().split() + else: + words = [ + self.vocab.ids_to_tokens[w] for w in words + ] + words = ' '.join(words).replace(' ##', '').split() + if len(words) <= 3: + continue + trigrams = [(words[i - 1], words[i], words[i + 1]) + for i in range(1, + len(words) - 1)] + trigram = tuple(trigrams[-1]) + if trigram in trigrams[:-1]: + fail = True + if fail: + curr_scores[i] = -10e20 + # Recover log probs. + topk_log_probs = topk_scores * length_penalty + + # Resolve beam origin and true word ids. + # topk_beam_index = topk_ids.div(vocab_size) + topk_beam_index = topk_ids // vocab_size + topk_ids = topk_ids.fmod(vocab_size) + + # Map beam_index to batch_index in the flat representation. + batch_index = ( + topk_beam_index + + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) + select_indices = batch_index.view(-1) + + # Append last prediction. + alive_seq = torch.cat([ + alive_seq.index_select(0, select_indices), + topk_ids.view(-1, 1) + ], -1) + + is_finished = topk_ids.eq(self.end_token) + if step + 1 == max_length: + is_finished.fill_(self.end_token) + # End condition is top beam is finished. 
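Editor's note: the beam scores above are accumulated log-probabilities divided by the GNMT-style length penalty `lp(step) = ((5 + step + 1) / 6) ** alpha`. A short worked example (the `alpha` value here is hypothetical; the real one comes from the model config):

```python
# Worked example of the length penalty used in _fast_translate_batch above.
alpha = 0.6  # hypothetical; read from model.config.alpha in the real code

for step in (0, 4, 9):
    lp = ((5.0 + (step + 1)) / 6.0) ** alpha
    print(f'step={step + 1:2d}  length_penalty={lp:.3f}')

# The penalty grows with hypothesis length, so dividing the summed
# log-probabilities by it keeps longer candidates competitive with short ones.
```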
+ end_condition = is_finished[:, 0].eq(1) + # Save finished hypotheses. + if is_finished.any(): + predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) + for i in range(is_finished.size(0)): + b = batch_offset[i] + if end_condition[i]: + is_finished[i].fill_(self.end_token) + finished_hyp = is_finished[i].nonzero().view(-1) + # Store finished hypotheses for this batch. + for j in finished_hyp: + hypotheses[b].append( + (topk_scores[i, j], predictions[i, j, 1:])) + # If the batch reached the end, save the n_best hypotheses. + if end_condition[i]: + best_hyp = sorted( + hypotheses[b], key=lambda x: x[0], reverse=True) + if self.args.dataset == 'qg_ranking_test' or ( + self.args.dataset == 'paraphrase' + and not self.args.sample_topk): + for each in best_hyp[:beam_size]: + score, pred = each + results['scores'][b].append(score) + results['predictions'][b].append(pred) + else: + score, pred = best_hyp[0] + results['scores'][b].append(score) + results['predictions'][b].append(pred) + non_finished = end_condition.eq(0).nonzero().view(-1) + # If all sentences are translated, no need to go further. + if len(non_finished) == 0: + break + # Remove finished batches for the next step. + topk_log_probs = topk_log_probs.index_select(0, non_finished) + batch_index = batch_index.index_select(0, non_finished) + batch_offset = batch_offset.index_select(0, non_finished) + alive_seq = predictions.index_select(0, non_finished) \ + .view(-1, alive_seq.size(-1)) + # Reorder states. + select_indices = batch_index.view(-1) + src_features = src_features.index_select(0, select_indices) + self.model.decoder.state.map_batch_fn( + lambda state, dim: state.index_select(dim, select_indices)) + + return results + + def forward(self, input_ids: torch.Tensor, + attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: + batch = self.Batch( + batch_size=input_ids.size()[0], + src=input_ids, + tgt=None, + mask_src=attention_mask) + translation_batch = self.translate_batch(batch) + + preds = translation_batch['predictions'] + return {'predictions': preds} diff --git a/modelscope/models/nlp/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py similarity index 96% rename from modelscope/models/nlp/palm_for_text_generation.py rename to modelscope/models/nlp/palm_v2/palm_for_text_generation.py index 23d60663..7f8e918b 100644 --- a/modelscope/models/nlp/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py @@ -22,8 +22,8 @@ class PalmForTextGeneration(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from sofa.models.palm_v2 import (PalmForConditionalGeneration, - Translator) + from modelscope.models.nlp.palm_v2 import ( + PalmForConditionalGeneration, Translator) self.model = PalmForConditionalGeneration.from_pretrained(model_dir) self.tokenizer = self.model.tokenizer self.generator = Translator(self.model) diff --git a/modelscope/models/nlp/sbert_for_nli.py b/modelscope/models/nlp/sbert_for_nli.py deleted file mode 100644 index ea62a8bd..00000000 --- a/modelscope/models/nlp/sbert_for_nli.py +++ /dev/null @@ -1,23 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForNLI'] - - -@MODELS.register_module(Tasks.nli, module_name=Models.structbert) -class SbertForNLI(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, 
**kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__( - model_dir, *args, model_args={'num_labels': 3}, **kwargs) - assert self.model.config.num_labels == 3 diff --git a/modelscope/models/nlp/sbert_for_sentence_similarity.py b/modelscope/models/nlp/sbert_for_sentence_similarity.py deleted file mode 100644 index 00b612ea..00000000 --- a/modelscope/models/nlp/sbert_for_sentence_similarity.py +++ /dev/null @@ -1,25 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForSentenceSimilarity'] - - -@MODELS.register_module( - Tasks.sentence_similarity, module_name=Models.structbert) -class SbertForSentenceSimilarity(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the sentence similarity model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__( - model_dir, *args, model_args={'num_labels': 2}, **kwargs) - self.model_dir = model_dir - assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sentiment_classification.py b/modelscope/models/nlp/sbert_for_sentiment_classification.py deleted file mode 100644 index 83ac93c5..00000000 --- a/modelscope/models/nlp/sbert_for_sentiment_classification.py +++ /dev/null @@ -1,22 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForSentimentClassification'] - - -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.structbert) -class SbertForSentimentClassification(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - super().__init__( - model_dir, *args, model_args={'num_labels': 2}, **kwargs) - assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sequence_classification.py b/modelscope/models/nlp/sbert_for_sequence_classification.py deleted file mode 100644 index 59fcf6fa..00000000 --- a/modelscope/models/nlp/sbert_for_sequence_classification.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -from typing import Any, Dict - -import json -import numpy as np -import torch -from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel -from torch import nn - -from modelscope.models import TorchModel - - -class SbertTextClassfier(SbertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - self.encoder = SbertModel(config, add_pooling_layer=True) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, - input_ids=None, - token_type_ids=None, - labels=None, - **kwargs): - outputs = self.encoder( - input_ids, - token_type_ids=token_type_ids, - return_dict=None, - ) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {'logits': logits, 'loss': loss} - return {'logits': logits} - - def build(**kwags): - return SbertTextClassfier.from_pretrained(model_dir, **model_args) - - -class SbertForSequenceClassificationBase(TorchModel): - - def __init__(self, model_dir: str, model_args=None, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - if model_args is None: - model_args = {} - self.model = SbertTextClassfier.from_pretrained( - model_dir, **model_args) - self.id2label = {} - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - input_ids = torch.tensor(input['input_ids'], dtype=torch.long) - token_type_ids = torch.tensor( - input['token_type_ids'], dtype=torch.long) - return self.model.forward(input_ids, token_type_ids) - - def postprocess(self, input, **kwargs): - logits = input['logits'] - probs = logits.softmax(-1).cpu().numpy() - pred = logits.argmax(-1).cpu().numpy() - logits = logits.cpu().numpy() - res = {'predictions': pred, 'probabilities': probs, 'logits': logits} - return res diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py deleted file mode 100644 index 748c4107..00000000 --- a/modelscope/models/nlp/sbert_for_token_classification.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SbertForTokenClassification'] - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) -class 
SbertForTokenClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the word segmentation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__(model_dir, *args, **kwargs) - self.model_dir = model_dir - import sofa - self.model = sofa.SbertForTokenClassification.from_pretrained( - self.model_dir) - self.config = sofa.SbertConfig.from_pretrained(self.model_dir) - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, - Any]) -> Dict[str, Union[str, np.ndarray]]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, Union[str,np.ndarray]]: results - Example: - { - 'predictions': array([1,4]), # lable 0-negative 1-positive - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - 'text': str(今天), - } - """ - input_ids = torch.tensor(input['input_ids']).unsqueeze(0) - return {**self.model(input_ids), 'text': input['text']} - - def postprocess(self, input: Dict[str, Tensor], - **kwargs) -> Dict[str, Tensor]: - logits = input['logits'] - pred = torch.argmax(logits[0], dim=-1) - pred = pred.cpu().numpy() - rst = {'predictions': pred, 'logits': logits, 'text': input['text']} - return rst diff --git a/modelscope/models/nlp/sbert_for_zero_shot_classification.py b/modelscope/models/nlp/sbert_for_zero_shot_classification.py deleted file mode 100644 index b772cf45..00000000 --- a/modelscope/models/nlp/sbert_for_zero_shot_classification.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import Any, Dict - -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SbertForZeroShotClassification'] - - -@MODELS.register_module( - Tasks.zero_shot_classification, module_name=Models.structbert) -class SbertForZeroShotClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the zero shot classification model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - - super().__init__(model_dir, *args, **kwargs) - from sofa import SbertForSequenceClassification - self.model = SbertForSequenceClassification.from_pretrained(model_dir) - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - outputs = self.model(**input) - logits = outputs['logits'].cpu().numpy() - res = {'logits': logits} - return res diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index 4920c6ff..5550d749 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -1,85 +1,174 @@ -import os -from typing import Any, Dict +from abc import abstractmethod -import json -import numpy as np +from torch import nn -from modelscope.metainfo import TaskModels +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertPreTrainedModel +from modelscope.models.nlp.veco import \ + VecoForSequenceClassification as VecoForSequenceClassificationTransform from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks -from .task_model import SingleBackboneTaskModelBase +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) -__all__ = ['SequenceClassificationModel'] +__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] -@MODELS.register_module( - Tasks.sentiment_classification, module_name=TaskModels.text_classification) -@MODELS.register_module( - Tasks.text_classification, module_name=TaskModels.text_classification) -class SequenceClassificationModel(SingleBackboneTaskModelBase): +class SequenceClassificationBase(TorchModel): + base_model_prefix: str = 'bert' + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.num_labels = config.num_labels + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the sequence classification model from the `model_dir` path. + @abstractmethod + def build_base_model(self): + """Build the backbone model. - Args: - model_dir (str): the model path. + Returns: the backbone instance. 
""" - super().__init__(model_dir, *args, **kwargs) - if 'base_model_prefix' in kwargs: - self._base_model_prefix = kwargs['base_model_prefix'] - - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - - # get the num_labels from label_mapping.json - self.id2label = {} - self.label_path = os.path.join(model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - head_cfg['num_labels'] = len(self.label_mapping) - - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - outputs = self.head.forward(pooled_output) - if 'labels' in input: - loss = self.compute_loss(outputs, input['labels']) - outputs.update(loss) - return outputs - - def extract_logits(self, outputs): - return outputs[OutputKeys.LOGITS].cpu().detach() - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - if hasattr(self.backbone, 'extract_pooled_outputs'): - pooled_output = self.backbone.extract_pooled_outputs(outputs) - return sequence_output, pooled_output - - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss + pass + + @property + def base_model(self): + return getattr(self, self.base_model_prefix) + + def forward(self, **kwargs): + labels = None + if OutputKeys.LABEL in kwargs: + labels = kwargs.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in kwargs: + labels = kwargs.pop(OutputKeys.LABELS) + + outputs = self.base_model.forward(**kwargs) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() + logits = input[OutputKeys.LOGITS] + probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1))) + pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1))) + logits = torch_nested_numpify(torch_nested_detach(logits)) res = { OutputKeys.PREDICTIONS: pred, OutputKeys.PROBABILITIES: probs, OutputKeys.LOGITS: logits } return res + + +@MODELS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassification(SequenceClassificationBase, + SbertPreTrainedModel): + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + SbertForSequenceClassification.base_model_prefix = 
config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + labels=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(SbertPreTrainedModel, + SbertForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) + + +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.veco) +@MODELS.register_module(Tasks.nli, module_name=Models.veco) +class VecoForSequenceClassification(TorchModel, + VecoForSequenceClassificationTransform): + + def __init__(self, config, model_dir): + super().__init__(model_dir) + VecoForSequenceClassificationTransform.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + **kwargs): + return VecoForSequenceClassificationTransform.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + labels=labels) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(VecoForSequenceClassificationTransform, + VecoForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py new file mode 100644 index 00000000..45f856c1 --- /dev/null +++ b/modelscope/models/nlp/space/__init__.py @@ -0,0 +1,28 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .model import SpaceGenerator + from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig + from .space_for_dialog_intent_prediction import SpaceForDialogIntent + from .space_for_dialog_modeling import SpaceForDialogModeling + from .space_for_dialog_state_tracking import SpaceForDialogStateTracking +else: + _import_structure = { + 'model': + ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'], + 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], + 'space_for_dialog_modeling': ['SpaceForDialogModeling'], + 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + } + + import sys + + 
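Editor's note: the `_instantiate` methods above infer `num_labels` from the label mapping shipped with the model when the caller does not pass it explicitly. A hedged sketch of that logic with a made-up mapping (the real mapping is what `parse_label_mapping` reads from the model directory):

```python
# Hypothetical label mapping, e.g. the content of a label_mapping.json.
label2id = {'negative': 0, 'neutral': 1, 'positive': 2}

num_labels = None  # caller did not specify it
if num_labels is None and label2id is not None and len(label2id) > 0:
    num_labels = len(label2id)

model_args = {} if num_labels is None else {'num_labels': num_labels}
print(model_args)  # -> {'num_labels': 3}
```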
sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py new file mode 100644 index 00000000..24641f06 --- /dev/null +++ b/modelscope/models/nlp/space/model/__init__.py @@ -0,0 +1,10 @@ +from .configuration_space import SpaceConfig +from .gen_unified_transformer import GenUnifiedTransformer +from .generator import Generator as SpaceGenerator +from .intent_unified_transformer import IntentUnifiedTransformer +from .model_base import SpaceModelBase +from .modeling_space import (SpaceForDST, SpaceForMaskedLM, + SpaceForPreTraining, SpaceModel) +from .tokenization_space import (BasicTokenizer, SpaceTokenizer, + WordpieceTokenizer) +from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/configuration_space.py b/modelscope/models/nlp/space/model/configuration_space.py new file mode 100644 index 00000000..0da2d629 --- /dev/null +++ b/modelscope/models/nlp/space/model/configuration_space.py @@ -0,0 +1,32 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors. +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Space configuration, mainly copied from :class:`~transformers.configuration_xlm_roberta` """ + +from modelscope.models.nlp.structbert import SbertConfig +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class SpaceConfig(SbertConfig): + """ + This class overrides [`SbertConfig`]. Please check the superclass for the appropriate + documentation alongside usage examples. 
+ """ + + model_type = 'space' diff --git a/modelscope/models/nlp/backbones/space/model/gen_unified_transformer.py b/modelscope/models/nlp/space/model/gen_unified_transformer.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/gen_unified_transformer.py rename to modelscope/models/nlp/space/model/gen_unified_transformer.py diff --git a/modelscope/models/nlp/backbones/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/generator.py rename to modelscope/models/nlp/space/model/generator.py diff --git a/modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py b/modelscope/models/nlp/space/model/intent_unified_transformer.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py rename to modelscope/models/nlp/space/model/intent_unified_transformer.py diff --git a/modelscope/models/nlp/backbones/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/model_base.py rename to modelscope/models/nlp/space/model/model_base.py diff --git a/modelscope/models/nlp/space/model/modeling_space.py b/modelscope/models/nlp/space/model/modeling_space.py new file mode 100644 index 00000000..f093cbc5 --- /dev/null +++ b/modelscope/models/nlp/space/model/modeling_space.py @@ -0,0 +1,268 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Space model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.file_utils import add_start_docstrings + +from modelscope.models.nlp.structbert.modeling_sbert import ( + SbertForMaskedLM, SbertModel, SbertPreTrainedModel) +from .configuration_space import SpaceConfig + +SPACE_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`SpaceConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. 
+""" + + +@add_start_docstrings( + 'The bare Space Model transformer outputting raw hidden-states without any specific head on top. ' + 'It is identical with the Bert Model from Transformers', + SPACE_START_DOCSTRING, +) +class SpaceModel(SbertModel): + """ + This class overrides [`SbertModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = SpaceConfig + + +@add_start_docstrings( + """ + Space Model transformer with Dialog state tracking heads on top (a inform projection + layer with a dialog state layer and a set of slots including history infromation from + previous dialog) e.g. for multiwoz2.2 tasks. + """, + SPACE_START_DOCSTRING, +) +class SpaceForDST(SbertPreTrainedModel): + + def __init__(self, config): + super(SpaceForDST, self).__init__(config) + self.slot_list = config.dst_slot_list + self.class_types = config.dst_class_types + self.class_labels = config.dst_class_labels + self.token_loss_for_nonpointable = config.dst_token_loss_for_nonpointable + self.refer_loss_for_nonpointable = config.dst_refer_loss_for_nonpointable + self.class_aux_feats_inform = config.dst_class_aux_feats_inform + self.class_aux_feats_ds = config.dst_class_aux_feats_ds + self.class_loss_ratio = config.dst_class_loss_ratio + + # Only use refer loss if refer class is present in dataset. + if 'refer' in self.class_types: + self.refer_index = self.class_types.index('refer') + else: + self.refer_index = -1 + + self.bert = SpaceModel(config) + self.dropout = nn.Dropout(config.dst_dropout_rate) + self.dropout_heads = nn.Dropout(config.dst_heads_dropout_rate) + + if self.class_aux_feats_inform: + self.add_module( + 'inform_projection', + nn.Linear(len(self.slot_list), len(self.slot_list))) + if self.class_aux_feats_ds: + self.add_module( + 'ds_projection', + nn.Linear(len(self.slot_list), len(self.slot_list))) + + aux_dims = len(self.slot_list) * ( + self.class_aux_feats_inform + self.class_aux_feats_ds + ) # second term is 0, 1 or 2 + + for slot in self.slot_list: + self.add_module( + 'class_' + slot, + nn.Linear(config.hidden_size + aux_dims, self.class_labels)) + self.add_module('token_' + slot, nn.Linear(config.hidden_size, 2)) + self.add_module( + 'refer_' + slot, + nn.Linear(config.hidden_size + aux_dims, + len(self.slot_list) + 1)) + + self.init_weights() + + def forward(self, + input_ids, + input_mask=None, + segment_ids=None, + position_ids=None, + head_mask=None, + start_pos=None, + end_pos=None, + inform_slot_id=None, + refer_id=None, + class_label_id=None, + diag_state=None): + outputs = self.bert( + input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + pooled_output = outputs[1] + + sequence_output = self.dropout(sequence_output) + pooled_output = self.dropout(pooled_output) + + # TODO: establish proper format in labels already? 
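Editor's note: a shape sketch of the per-slot heads built in `SpaceForDST.__init__` above. The slot names and sizes are illustrative; only the `hidden_size + aux_dims` wiring follows the code:

```python
import torch
from torch import nn

# Illustrative setup: 4 slots, hidden_size 8, both aux feature types enabled.
slot_list = ['hotel-area', 'hotel-name', 'train-day', 'train-leaveat']
hidden_size, class_labels = 8, 3
aux_dims = len(slot_list) * (1 + 1)                 # inform + dialog-state features

class_head = nn.Linear(hidden_size + aux_dims, class_labels)
token_head = nn.Linear(hidden_size, 2)              # start / end logits per token
refer_head = nn.Linear(hidden_size + aux_dims, len(slot_list) + 1)

pooled = torch.randn(2, hidden_size)                # batch of 2 dialog turns
aux = torch.randn(2, aux_dims)
class_logits = class_head(torch.cat((pooled, aux), dim=1))  # -> (2, class_labels)
```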
+ if inform_slot_id is not None: + inform_labels = torch.stack(list(inform_slot_id.values()), + 1).float() + if diag_state is not None: + diag_state_labels = torch.clamp( + torch.stack(list(diag_state.values()), 1).float(), 0.0, 1.0) + + total_loss = 0 + per_slot_per_example_loss = {} + per_slot_class_logits = {} + per_slot_start_logits = {} + per_slot_end_logits = {} + per_slot_refer_logits = {} + for slot in self.slot_list: + if self.class_aux_feats_inform and self.class_aux_feats_ds: + pooled_output_aux = torch.cat( + (pooled_output, self.inform_projection(inform_labels), + self.ds_projection(diag_state_labels)), 1) + elif self.class_aux_feats_inform: + pooled_output_aux = torch.cat( + (pooled_output, self.inform_projection(inform_labels)), 1) + elif self.class_aux_feats_ds: + pooled_output_aux = torch.cat( + (pooled_output, self.ds_projection(diag_state_labels)), 1) + else: + pooled_output_aux = pooled_output + class_logits = self.dropout_heads( + getattr(self, 'class_' + slot)(pooled_output_aux)) + + token_logits = self.dropout_heads( + getattr(self, 'token_' + slot)(sequence_output)) + start_logits, end_logits = token_logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + refer_logits = self.dropout_heads( + getattr(self, 'refer_' + slot)(pooled_output_aux)) + + per_slot_class_logits[slot] = class_logits + per_slot_start_logits[slot] = start_logits + per_slot_end_logits[slot] = end_logits + per_slot_refer_logits[slot] = refer_logits + + # If there are no labels, don't compute loss + if class_label_id is not None and start_pos is not None and end_pos is not None and refer_id is not None: + # If we are on multi-GPU, split add a dimension + if len(start_pos[slot].size()) > 1: + start_pos[slot] = start_pos[slot].squeeze(-1) + if len(end_pos[slot].size()) > 1: + end_pos[slot] = end_pos[slot].squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) # This is a single index + start_pos[slot].clamp_(0, ignored_index) + end_pos[slot].clamp_(0, ignored_index) + + class_loss_fct = CrossEntropyLoss(reduction='none') + token_loss_fct = CrossEntropyLoss( + reduction='none', ignore_index=ignored_index) + refer_loss_fct = CrossEntropyLoss(reduction='none') + + start_loss = token_loss_fct(start_logits, start_pos[slot]) + end_loss = token_loss_fct(end_logits, end_pos[slot]) + token_loss = (start_loss + end_loss) / 2.0 + + token_is_pointable = (start_pos[slot] > 0).float() + if not self.token_loss_for_nonpointable: + token_loss *= token_is_pointable + + refer_loss = refer_loss_fct(refer_logits, refer_id[slot]) + token_is_referrable = torch.eq(class_label_id[slot], + self.refer_index).float() + if not self.refer_loss_for_nonpointable: + refer_loss *= token_is_referrable + + class_loss = class_loss_fct(class_logits, class_label_id[slot]) + + if self.refer_index > -1: + per_example_loss = (self.class_loss_ratio) * class_loss + ( + (1 - self.class_loss_ratio) / 2) * token_loss + ( + (1 - self.class_loss_ratio) / 2) * refer_loss + else: + per_example_loss = self.class_loss_ratio * class_loss + ( + 1 - self.class_loss_ratio) * token_loss + + total_loss += per_example_loss.sum() + per_slot_per_example_loss[slot] = per_example_loss + + # add hidden states and attention if they are here + outputs = (total_loss, ) + ( + per_slot_per_example_loss, + per_slot_class_logits, + per_slot_start_logits, + per_slot_end_logits, + per_slot_refer_logits, + ) + outputs[2:] + + return 
outputs
+
+
+@add_start_docstrings(
+    'The Space Model with a `language modeling` head on top',
+    SPACE_START_DOCSTRING,
+)
+class SpaceForMaskedLM(SbertForMaskedLM):
+    """
+    This class overrides [`SbertForMaskedLM`]. Please check the superclass for the
+    appropriate documentation alongside usage examples.
+    """
+
+    config_class = SpaceConfig
+
+
+@add_start_docstrings(
+    """
+    Space Model with only one head on top as done during the pretraining: a `masked language modeling` head.
+    """,
+    SPACE_START_DOCSTRING,
+)
+class SpaceForPreTraining(SbertPreTrainedModel):
+
+    def __init__(self, model_name_or_path: str):
+        super(SpaceForPreTraining, self).__init__()
+        self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path)
+
+    def forward(self, input_ids: torch.Tensor, mlm_labels: torch.Tensor):
+        outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels)
+        return outputs[0]
diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py
new file mode 100644
index 00000000..84712b7b
--- /dev/null
+++ b/modelscope/models/nlp/space/model/tokenization_space.py
@@ -0,0 +1,29 @@
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for Space. mainly copied from :module:`~transformers.tokenization_xlm_roberta`"""
+
+from modelscope.models.nlp.structbert import (BasicTokenizer, SbertTokenizer,
+                                              WordpieceTokenizer)
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class SpaceTokenizer(SbertTokenizer):
+    """
+    This class overrides [`SbertTokenizer`]. Please check the superclass for the appropriate
+    documentation alongside usage examples.
+ """ diff --git a/modelscope/models/nlp/backbones/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py similarity index 97% rename from modelscope/models/nlp/backbones/space/model/unified_transformer.py rename to modelscope/models/nlp/space/model/unified_transformer.py index f5df954d..b0775541 100644 --- a/modelscope/models/nlp/backbones/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -5,10 +5,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modelscope.models.nlp.backbones.space.model.model_base import \ - SpaceModelBase -from modelscope.models.nlp.backbones.space.modules.embedder import Embedder -from modelscope.models.nlp.backbones.space.modules.transformer_block import \ +from modelscope.models.nlp.space.model.model_base import SpaceModelBase +from modelscope.models.nlp.space.modules.embedder import Embedder +from modelscope.models.nlp.space.modules.transformer_block import \ TransformerBlock diff --git a/modelscope/models/nlp/backbones/space/modules/__init__.py b/modelscope/models/nlp/space/modules/__init__.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/__init__.py rename to modelscope/models/nlp/space/modules/__init__.py diff --git a/modelscope/models/nlp/backbones/space/modules/embedder.py b/modelscope/models/nlp/space/modules/embedder.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/embedder.py rename to modelscope/models/nlp/space/modules/embedder.py diff --git a/modelscope/models/nlp/backbones/space/modules/feedforward.py b/modelscope/models/nlp/space/modules/feedforward.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/feedforward.py rename to modelscope/models/nlp/space/modules/feedforward.py diff --git a/modelscope/models/nlp/backbones/space/modules/functions.py b/modelscope/models/nlp/space/modules/functions.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/functions.py rename to modelscope/models/nlp/space/modules/functions.py diff --git a/modelscope/models/nlp/backbones/space/modules/multihead_attention.py b/modelscope/models/nlp/space/modules/multihead_attention.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/multihead_attention.py rename to modelscope/models/nlp/space/modules/multihead_attention.py diff --git a/modelscope/models/nlp/backbones/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/transformer_block.py rename to modelscope/models/nlp/space/modules/transformer_block.py diff --git a/modelscope/models/nlp/space_for_dialog_intent_prediction.py b/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_intent_prediction.py rename to modelscope/models/nlp/space/space_for_dialog_intent_prediction.py index bd0eb63b..c862fbef 100644 --- a/modelscope/models/nlp/space_for_dialog_intent_prediction.py +++ b/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py @@ -7,7 +7,7 @@ from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.backbones import SpaceGenerator, SpaceModelBase +from modelscope.models.nlp.space import SpaceGenerator, 
SpaceModelBase from modelscope.preprocessors.space import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks diff --git a/modelscope/models/nlp/space_for_dialog_modeling.py b/modelscope/models/nlp/space/space_for_dialog_modeling.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_modeling.py rename to modelscope/models/nlp/space/space_for_dialog_modeling.py index 60713c3d..8b9ed8b3 100644 --- a/modelscope/models/nlp/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/space_for_dialog_modeling.py @@ -7,7 +7,7 @@ from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.backbones import SpaceGenerator, SpaceModelBase +from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase from modelscope.preprocessors.space import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks diff --git a/modelscope/models/nlp/space_for_dialog_state_tracking.py b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_state_tracking.py rename to modelscope/models/nlp/space/space_for_dialog_state_tracking.py index de5f95ce..ee7356b1 100644 --- a/modelscope/models/nlp/space_for_dialog_state_tracking.py +++ b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py @@ -21,7 +21,7 @@ class SpaceForDialogStateTracking(TorchModel): super().__init__(model_dir, *args, **kwargs) - from sofa.models.space import SpaceConfig, SpaceForDST + from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig self.model_dir = model_dir self.config = SpaceConfig.from_pretrained(self.model_dir) diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py new file mode 100644 index 00000000..d42db83c --- /dev/null +++ b/modelscope/models/nlp/structbert/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
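Editor's note: the new `structbert/__init__.py` below uses the same `LazyImportModule` pattern as the `space` package above, so names listed in `_import_structure` only trigger their real imports on first access. A hedged consumer-side sketch (the config argument is illustrative; library defaults are assumed for everything else):

```python
# Importing a registered symbol resolves the lazy module transparently.
from modelscope.models.nlp.structbert import SbertConfig, SbertModel

config = SbertConfig(num_hidden_layers=2)  # toy-sized encoder for illustration
model = SbertModel(config, add_pooling_layer=True)
```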
+ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_sbert import SbertConfig + from .modeling_sbert import (SbertForMaskedLM, SbertModel, + SbertPreTrainedModel) + from .tokenization_sbert import (BasicTokenizer, SbertTokenizer, + WordpieceTokenizer) + from .tokenization_sbert_fast import SbertTokenizerFast +else: + _import_structure = { + 'configuration_sbert': ['SbertConfig'], + 'modeling_sbert': + ['SbertForMaskedLM', 'SbertModel', 'SbertPreTrainedModel'], + 'tokenization_sbert': + ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], + 'tokenization_sbert_fast': ['SbertTokenizerFast'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/backbones/structbert/adv_utils.py b/modelscope/models/nlp/structbert/adv_utils.py similarity index 96% rename from modelscope/models/nlp/backbones/structbert/adv_utils.py rename to modelscope/models/nlp/structbert/adv_utils.py index 9864148f..44aae85c 100644 --- a/modelscope/models/nlp/backbones/structbert/adv_utils.py +++ b/modelscope/models/nlp/structbert/adv_utils.py @@ -59,7 +59,8 @@ def compute_adv_loss(embedding, """ Calculate the adv loss of the model. :param embedding: Original sentense embedding - :param model: The model or the forward function(including decoder/classifier), accept kwargs as input, output logits + :param model: The model, or the forward function(including decoder/classifier), + accept kwargs as input, output logits :param ori_logits: The original logits outputed from the model function :param ori_loss: The original loss :param adv_grad_factor: This factor will be multipled by the KL loss grad and then the result will be added to @@ -119,7 +120,8 @@ def compute_adv_loss_pair(embedding, """ Calculate the adv loss of the model. This function is used in the pair logits scenerio. :param embedding: Original sentense embedding - :param model: The model or the forward function(including decoder/classifier), accept kwargs as input, output logits + :param model: The model, or the forward function(including decoder/classifier), + accept kwargs as input, output logits :param start_logits: The original start logits outputed from the model function :param end_logits: The original end logits outputed from the model function :param ori_loss: The original loss diff --git a/modelscope/models/nlp/backbones/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration_sbert.py similarity index 94% rename from modelscope/models/nlp/backbones/structbert/configuration_sbert.py rename to modelscope/models/nlp/structbert/configuration_sbert.py index 878b2216..374d4b62 100644 --- a/modelscope/models/nlp/backbones/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration_sbert.py @@ -24,11 +24,12 @@ logger = logging.get_logger(__name__) class SbertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~sofa.models.SbertModel`. + This is the configuration class to store the configuration + of a :class:`~modelscope.models.nlp.structbert.SbertModel`. It is used to instantiate a SBERT model according to the specified arguments. - Configuration objects inherit from :class:`~sofa.utils.PretrainedConfig` and can be used to control the model - outputs. 
Read the documentation from :class:`~sofa.utils.PretrainedConfig` for more information. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: @@ -99,11 +100,13 @@ class SbertConfig(PretrainedConfig): type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, + pad_token_id=0, position_embedding_type='absolute', use_cache=True, classifier_dropout=None, **kwargs): - super().__init__(**kwargs) + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py new file mode 100755 index 00000000..bbac3c95 --- /dev/null +++ b/modelscope/models/nlp/structbert/modeling_sbert.py @@ -0,0 +1,1964 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SBERT model. 
mainly copied from :module:`~transformers.modeling_bert`""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger +from .adv_utils import compute_adv_loss, compute_adv_loss_pair +from .configuration_sbert import SbertConfig + +logger = get_logger(__name__) + +_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512' +_CONFIG_FOR_DOC = 'SbertConfig' +_TOKENIZER_FOR_DOC = 'SbertTokenizer' + + +class SbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + return_inputs_embeds=False): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users + # when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + 
buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + if not return_inputs_embeds: + return embeddings + else: + return embeddings, inputs_embeds + + +class SbertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
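+        # Four cases are handled below: cached cross-attention, fresh
+        # cross-attention, cached (incremental decoding) self-attention,
+        # and plain self-attention.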
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class SbertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = SbertSelfAttention(config) + self.output = SbertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class SbertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class SbertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = SbertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = SbertAttention(config) + self.intermediate = SbertIntermediate(config) + self.output = SbertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' + f'layers by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SbertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [SbertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SbertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
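+        # For BERT-style inputs this first position is the [CLS] token.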
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SbertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SbertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = SbertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SbertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class SbertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class SbertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class SbertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SbertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, SbertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class SbertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.BertForPreTraining`. 
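+    It is used here as the output type of :class:`SbertForPreTraining`.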
+ + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` + is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` + is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +SBERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +SBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. +""" + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( + BaseModelOutputWithPoolingAndCrossAttentions): + embedding_output: torch.FloatTensor = None + logits: Optional[Union[tuple, torch.FloatTensor]] = None + kwargs: dict = None + + +@add_start_docstrings( + 'The Sbert Model transformer outputting raw hidden-states without any specific head on top.', + SBERT_START_DOCSTRING, +) +class SbertModel(SbertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config: SbertConfig, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = SbertEmbeddings(config) + self.encoder = SbertEncoder(config) + + self.pooler = SbertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple + having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
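+
+        Example (a minimal sketch; the checkpoint path is a placeholder and is
+        expected to contain the SBERT config, weights and vocabulary)::
+
+            >>> from modelscope.models.nlp.structbert import SbertModel, SbertTokenizer
+            >>> tokenizer = SbertTokenizer.from_pretrained('/path/to/sbert-checkpoint')
+            >>> model = SbertModel.from_pretrained('/path/to/sbert-checkpoint')
+            >>> inputs = tokenizer('The weather is nice today.', return_tensors='pt')
+            >>> outputs = model(**inputs)
+            >>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)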
+ """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
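+        # get_extended_attention_mask() (inherited from the transformers
+        # PreTrainedModel base) turns the 2D [batch_size, seq_length] mask into
+        # an additive mask broadcastable to
+        # [batch_size, num_heads, seq_length, seq_length], with a large
+        # negative value at the padded positions.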
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output, orignal_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + return_inputs_embeds=True, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, + pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + + return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + embedding_output=orignal_embeds) + + +@add_start_docstrings( + """ + Sbert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
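+    Both heads share the :class:`SbertModel` backbone; when ``labels`` and
+    ``next_sentence_label`` are both provided, the returned loss is the sum of
+    the masked-LM loss and the next-sentence-prediction loss.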
+ """, + SBERT_START_DOCSTRING, +) +class SbertForPreTraining(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + + self.bert = SbertModel(config) + self.cls = SbertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=SbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, + seq_relationship_score) + outputs[2:-1] + return ((total_loss, ) + + output) if total_loss is not None else output + + return SbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Sbert Model with a `language modeling` head on top for CLM fine-tuning. 
""", + SBERT_START_DOCSTRING) +class SbertLMHeadModel(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `SbertLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.bert = SbertModel(config, add_pooling_layer=False) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` + with each tuple having 4 tensors of + shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """Sbert Model with a `language modeling` head on top. 
""", + SBERT_START_DOCSTRING) +class SbertForMaskedLM(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = SbertModel(config) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + attention_mask_zero = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_zero], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +@add_start_docstrings( + """Sbert Model 
with a `next sentence prediction (classification)` head on top. """, + SBERT_START_DOCSTRING, +) +class SbertForNextSentencePrediction(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + + self.bert = SbertModel(config) + self.cls = SbertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + """ + + if 'next_sentence_label' in kwargs: + warnings.warn( + 'The `next_sentence_label` argument is deprecated and will be removed ' + 'in a future version, use `labels` instead.', + FutureWarning, + ) + labels = kwargs.pop('next_sentence_label') + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:-1] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
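+    When ``config.adv_grad_factor`` is set, an adversarial loss (see
+    :func:`~modelscope.models.nlp.structbert.adv_utils.compute_adv_loss`) is
+    added on top of the cross-entropy loss during training.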
+ """, + SBERT_START_DOCSTRING, +) +class SbertForSequenceClassification(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
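+
+        Example (a minimal sketch; the checkpoint path and ``num_labels`` are
+        placeholders)::
+
+            >>> import torch
+            >>> from modelscope.models.nlp.structbert import SbertTokenizer
+            >>> from modelscope.models.nlp.structbert.modeling_sbert import SbertForSequenceClassification
+            >>> tokenizer = SbertTokenizer.from_pretrained('/path/to/sbert-checkpoint')
+            >>> model = SbertForSequenceClassification.from_pretrained('/path/to/sbert-checkpoint', num_labels=2)
+            >>> inputs = tokenizer('This movie is great.', return_tensors='pt')
+            >>> outputs = model(**inputs, labels=torch.tensor([1]))
+            >>> outputs.loss, outputs.logits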
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, labels, **outputs.kwargs) + + def compute_loss(self, outputs, labels, **kwargs): + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + SBERT_START_DOCSTRING, +) +class SbertForMultipleChoice(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + def _forward_call(self, num_choices, **kwargs): + outputs = self.bert(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits.view(-1, num_choices) + kwargs['num_choices'] = num_choices + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format( + 'batch_size, num_choices, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_choices=num_choices) + + reshaped_logits = outputs.logits + kwargs = outputs.kwargs + embedding_output = outputs.embedding_output + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=reshaped_logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + + return MultipleChoiceModelOutput( + loss=loss, + 
logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + SBERT_START_DOCSTRING, +) +class SbertForTokenClassification(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
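+
+        Example (illustrative sketch, assuming ``model`` is an already-loaded
+        :class:`SbertForTokenClassification`; tensor values are placeholders)::
+
+            >>> import torch
+            >>> # one label id per token; positions where attention_mask == 0 are ignored in the loss
+            >>> input_ids = torch.tensor([[101, 2769, 102]])
+            >>> labels = torch.tensor([[0, 1, 0]])
+            >>> out = model(input_ids=input_ids,
+            ...             attention_mask=torch.ones_like(input_ids),
+            ...             labels=labels)
+            >>> out.loss, out.logits.shape  # scalar loss, logits of shape (1, 3, config.num_labels)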
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + with_attention_mask=attention_mask is not None, + **outputs.kwargs) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + SBERT_START_DOCSTRING, +) +class SbertForQuestionAnswering(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + outputs['logits'] = (start_logits, end_logits) + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. 
+ Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, start_positions, end_positions, + **outputs.kwargs) + + def compute_loss(self, + outputs, + start_positions=None, + end_positions=None, + **kwargs): + start_logits, end_logits = outputs.logits + embedding_output = outputs.embedding_output + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if self.config.adv_grad_factor is not None and self.training: + total_loss = compute_adv_loss_pair( + embedding=embedding_output, + model=self._forward_call, + start_logits=start_logits, + end_logits=end_logits, + ori_loss=total_loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization_sbert.py new file mode 100644 index 00000000..6db69509 --- /dev/null +++ b/modelscope/models/nlp/structbert/tokenization_sbert.py @@ -0,0 +1,516 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert`""" + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, + _is_punctuation, _is_whitespace) + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'chinese_sbert-large-std-512': 512, + 'english_sbert-large-std-512': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'english_sbert-large-std-512': { + 'do_lower_case': True + }, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, 'r', encoding='utf-8') as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip('\n') + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class SbertTokenizer(PreTrainedTokenizer): + r""" + Construct a SBERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to do basic tokenization before WordPiece. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. 
+ + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained " + 'model use `tokenizer = SbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = ' '.join(tokens).replace(' ##', '').strip() + return out_string + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A SBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + else: + vocab_file = (filename_prefix + + '-' if filename_prefix else '') + save_directory + with open(vocab_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted( + self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f'Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive.' 
+ ' Please check that the vocabulary is not corrupted!') + index = token_index + writer.write(token + '\n') + index += 1 + return (vocab_file, ) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + def __init__(self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + :func:`PreTrainedTokenizer.tokenize`) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union( + set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
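+        # Note (illustrative): _tokenize_chinese_chars only inserts spaces around CJK
+        # characters, e.g. "playing中文" -> "playing 中  文 ", so each CJK character ends up
+        # as its own token after whitespace_tokenize below.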
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) + or (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) + or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) + or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
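+
+        Note (clarification of the loop below): if a token cannot be fully decomposed into
+        in-vocabulary pieces, the whole token is mapped to :obj:`unk_token` rather than being
+        partially split.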
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py new file mode 100644 index 00000000..b02039c6 --- /dev/null +++ b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py @@ -0,0 +1,200 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert_fast`""" + +from typing import List, Optional, Tuple + +import json +import transformers +from tokenizers import normalizers +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils.logger import get_logger +from .tokenization_sbert import SbertTokenizer + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.txt', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': {}, + 'tokenizer_file': {}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'chinese_sbert-large-std-512': 512, + 'english_sbert-large-std-512': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'english_sbert-large-std-512': { + 'do_lower_case': True + }, +} + +transformers.SLOW_TO_FAST_CONVERTERS[ + 'SbertTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS['BertTokenizer'] + + +class SbertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" SBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this + issue `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = SbertTokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + pre_tok_state = json.loads( + self.backend_tokenizer.normalizer.__getstate__()) + if (pre_tok_state.get('lowercase', do_lower_case) != do_lower_case + or pre_tok_state.get('strip_accents', + strip_accents) != strip_accents): + pre_tok_class = getattr(normalizers, pre_tok_state.pop('type')) + pre_tok_state['lowercase'] = do_lower_case + pre_tok_state['strip_accents'] = strip_accents + self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A SBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1: + output += token_ids_1 + [self.sep_token_id] + + return output + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save( + save_directory, name=filename_prefix) + return tuple(files) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py new file mode 100644 index 00000000..988f2917 --- /dev/null +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -0,0 +1,86 @@ +import os +from typing import Any, Dict + +import json +import numpy as np + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['SequenceClassificationModel'] + + +@MODELS.register_module( + Tasks.sentiment_classification, module_name=TaskModels.text_classification) +@MODELS.register_module( + Tasks.text_classification, module_name=TaskModels.text_classification) +class SequenceClassificationModel(SingleBackboneTaskModelBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the sequence classification model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
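+
+            A minimal, illustrative ``model_dir`` layout (file names other than
+            ``label_mapping.json`` are examples only)::
+
+                model_dir/
+                    configuration.json    # typically holds the `backbone` and `head` cfg read below
+                    label_mapping.json    # e.g. {"negative": 0, "positive": 1} -> num_labels = 2
+                    pytorch_model.bin     # backbone weights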
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + backbone_cfg = self.cfg.backbone + head_cfg = self.cfg.head + + # get the num_labels from label_mapping.json + self.id2label = {} + self.label_path = os.path.join(model_dir, 'label_mapping.json') + if os.path.exists(self.label_path): + with open(self.label_path) as f: + self.label_mapping = json.load(f) + self.id2label = { + idx: name + for name, idx in self.label_mapping.items() + } + head_cfg['num_labels'] = len(self.label_mapping) + + self.build_backbone(backbone_cfg) + self.build_head(head_cfg) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(pooled_output) + if 'labels' in input: + loss = self.compute_loss(outputs, input['labels']) + outputs.update(loss) + return outputs + + def extract_logits(self, outputs): + return outputs[OutputKeys.LOGITS].cpu().detach() + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + if hasattr(self.backbone, 'extract_pooled_outputs'): + pooled_output = self.backbone.extract_pooled_outputs(outputs) + return sequence_output, pooled_output + + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def postprocess(self, input, **kwargs): + logits = self.extract_logits(input) + probs = logits.softmax(-1).numpy() + pred = logits.argmax(-1).numpy() + logits = logits.numpy() + res = { + OutputKeys.PREDICTIONS: pred, + OutputKeys.PROBABILITIES: probs, + OutputKeys.LOGITS: logits + } + return res diff --git a/modelscope/models/nlp/task_model.py b/modelscope/models/nlp/task_models/task_model.py similarity index 98% rename from modelscope/models/nlp/task_model.py rename to modelscope/models/nlp/task_models/task_model.py index e83c6604..104b4c32 100644 --- a/modelscope/models/nlp/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -11,8 +11,8 @@ from modelscope.models.base import TorchModel from modelscope.models.builder import build_backbone, build_head from modelscope.utils.config import ConfigDict from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger -from modelscope.utils.utils import if_func_receive_dict_inputs logger = get_logger(__name__) @@ -424,12 +424,15 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: """default forward method is the backbone-only forward""" - if if_func_receive_dict_inputs(self.backbone.forward): + if func_receive_dict_inputs(self.backbone.forward): outputs = self.backbone.forward(input) else: outputs = self.backbone.forward(**input) return outputs + def compute_loss(self, outputs: Dict[str, Any], labels): + raise NotImplementedError() + class EncoderDecoderTaskModelBase(BaseTaskModel): """ @@ -472,13 +475,13 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): return getattr(self, self._decoder_prefix) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - if if_func_receive_dict_inputs(self.encoder_.forward): + if func_receive_dict_inputs(self.encoder_.forward): encoder_outputs = self.encoder_.forward(input) else: 
encoder_outputs = self.encoder_.forward(**input) decoder_inputs = self.project_decoder_inputs_and_mediate( input, encoder_outputs) - if if_func_receive_dict_inputs(self.decoder_.forward): + if func_receive_dict_inputs(self.decoder_.forward): outputs = self.decoder_.forward(decoder_inputs) else: outputs = self.decoder_.forward(**decoder_inputs) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py new file mode 100644 index 00000000..ebb1eda2 --- /dev/null +++ b/modelscope/models/nlp/token_classification.py @@ -0,0 +1,147 @@ +from abc import abstractmethod +from typing import Dict + +import numpy as np +import torch +from torch import nn + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .structbert import SbertPreTrainedModel + +__all__ = ['SbertForTokenClassification'] + + +class TokenClassification(TorchModel): + + base_model_prefix: str = 'bert' + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.num_labels = config.num_labels + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + @abstractmethod + def build_base_model(self): + """Build the backbone model. + + Returns: the backbone instance. + """ + pass + + @property + def base_model(self): + return getattr(self, self.base_model_prefix) + + def compute_loss(self, logits, labels, **kwargs): + """Compute loss. + + For example, if backbone is pretrained model, there will be a 'attention_mask' parameter to skip + useless tokens. + + Args: + logits: The logits from the classifier + labels: The labels + **kwargs: Other input params. + + Returns: Loss. 
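+
+        For example, a subclass working on padded batches may mask out padding positions via
+        ``attention_mask`` before applying the loss, as :meth:`SbertForTokenClassification.compute_loss`
+        below does.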
+ + """ + pass + + def forward(self, **kwargs): + labels = None + if OutputKeys.LABEL in kwargs: + labels = kwargs.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in kwargs: + labels = kwargs.pop(OutputKeys.LABELS) + + outputs = self.base_model(**kwargs) + # base model should return the sequence_output as its first output + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + if labels is not None: + loss = self.compute_loss(logits, labels, **kwargs) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} + + def postprocess(self, input: Dict[str, np.ndarray], + **kwargs) -> Dict[str, np.ndarray]: + logits = input[OutputKeys.LOGITS] + pred = torch.argmax(logits[0], dim=-1) + pred = torch_nested_numpify(torch_nested_detach(pred)) + logits = torch_nested_numpify(torch_nested_detach(logits)) + rst = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} + return rst + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module( + Tasks.token_classification, module_name=Models.structbert) +class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): + + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + SbertForTokenClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=False) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + labels=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels) + + def compute_loss(self, logits, labels, attention_mask=None, **kwargs): + loss_fct = nn.CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + return loss_fct(active_logits, active_labels) + else: + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(SbertPreTrainedModel, + SbertForTokenClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py new file mode 100644 index 00000000..0fe786fd --- /dev/null +++ b/modelscope/models/nlp/veco/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_veco import VecoConfig + from .modeling_veco import (VecoForMaskedLM, VecoForSequenceClassification, + VecoModel) + from .tokenization_veco import VecoTokenizer + from .tokenization_veco_fast import VecoTokenizerFast +else: + _import_structure = { + 'configuration_veco': ['VecoConfig'], + 'modeling_veco': + ['VecoForMaskedLM', 'VecoForSequenceClassification', 'VecoModel'], + 'tokenization_veco': ['VecoTokenizer'], + 'tokenization_veco_fast': ['VecoTokenizerFast'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/veco/configuration_veco.py b/modelscope/models/nlp/veco/configuration_veco.py new file mode 100644 index 00000000..396755dc --- /dev/null +++ b/modelscope/models/nlp/veco/configuration_veco.py @@ -0,0 +1,33 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors. +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Veco configuration, mainly copied from :class:`~transformers.configuration_xlm_roberta` """ + +from transformers import RobertaConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class VecoConfig(RobertaConfig): + """ + This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + model_type = 'veco' diff --git a/modelscope/models/nlp/veco/modeling_veco.py b/modelscope/models/nlp/veco/modeling_veco.py new file mode 100644 index 00000000..b519c236 --- /dev/null +++ b/modelscope/models/nlp/veco/modeling_veco.py @@ -0,0 +1,143 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Veco model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +from transformers import (RobertaForMaskedLM, RobertaForMultipleChoice, + RobertaForQuestionAnswering, + RobertaForSequenceClassification, + RobertaForTokenClassification, RobertaModel) +from transformers.file_utils import add_start_docstrings + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils import logger as logging +from modelscope.utils.constant import Fields +from .configuration_veco import VecoConfig + +logger = logging.get_logger(__name__) + +VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +VECO_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + + +@add_start_docstrings( + 'The bare Veco Model transformer outputting raw hidden-states without any specific head on top.', + VECO_START_DOCSTRING, +) +class VecoModel(RobertaModel): + """ + This class overrides [`RobertaModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + VECO_START_DOCSTRING, +) +class VecoForSequenceClassification(RobertaForSequenceClassification): + """ + This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model transformer with a masked language model head on top (a linear layer on top of the + pooled output). + """, + VECO_START_DOCSTRING, +) +class VecoForMaskedLM(RobertaForMaskedLM): + """ + This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. + """, + VECO_START_DOCSTRING, +) +class VecoForMultipleChoice(RobertaForMultipleChoice): + """ + This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
+ """, + VECO_START_DOCSTRING, +) +class VecoForTokenClassification(RobertaForTokenClassification): + """ + This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + VECO_START_DOCSTRING, +) +class VecoForQuestionAnswering(RobertaForQuestionAnswering): + """ + This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig diff --git a/modelscope/models/nlp/veco/tokenization_veco.py b/modelscope/models/nlp/veco/tokenization_veco.py new file mode 100644 index 00000000..21711456 --- /dev/null +++ b/modelscope/models/nlp/veco/tokenization_veco.py @@ -0,0 +1,321 @@ +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for Veco. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = '▁' + +VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + + +class VecoTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. 
two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. + The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) + can be used, among other things, to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ['input_ids', 'attention_mask'] + + def __init__(self, + vocab_file, + bos_token='', + eos_token='', + sep_token='', + cls_token='', + unk_token='', + pad_token='', + mask_token='', + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken( + mask_token, lstrip=True, rstrip=False) if isinstance( + mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' 
| '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = { + '': 0, + '': 1, + '': 2, + '': 3 + } + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.fairseq_tokens_to_ids[''] = len( + self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = { + v: k + for k, v in self.fairseq_tokens_to_ids.items() + } + + def __getstate__(self): + state = self.__dict__.copy() + state['sp_model'] = None + state['sp_model_proto'] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, 'sp_model_kwargs'): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An Veco sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ( + [0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
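+
+        Example (an illustrative sketch, the ids below are placeholders rather than
+        real sentencepiece vocabulary ids): `create_token_type_ids_from_sequences([5, 6], [7])`
+        returns `[0, 0, 0, 0, 0, 0, 0]`, one zero for every position of the pair built
+        as cls + [5, 6] + sep + sep + [7] + sep.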
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len( + self.sp_model) + self.fairseq_offset + 1 # Add the token + + def get_vocab(self): + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + return out_string + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/models/nlp/veco/tokenization_veco_fast.py b/modelscope/models/nlp/veco/tokenization_veco_fast.py new file mode 100644 index 00000000..3edae0e7 --- /dev/null +++ b/modelscope/models/nlp/veco/tokenization_veco_fast.py @@ -0,0 +1,213 @@ +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Fast Tokenization classes for Veco. 
mainly copied from :module:`~transformers.tokenization_xlm_roberta_fast`""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +import transformers +from transformers.file_utils import is_sentencepiece_available +from transformers.tokenization_utils import AddedToken +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils import logger as logging + +if is_sentencepiece_available(): + from .tokenization_veco import VecoTokenizer +else: + VecoTokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'sentencepiece.bpe.model', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': {}, + 'tokenizer_file': {}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +transformers.SLOW_TO_FAST_CONVERTERS[ + 'VecoTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS[ + 'XLMRobertaTokenizer'] + + +class VecoTokenizerFast(PreTrainedTokenizerFast): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. + Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. 
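+
+    Example (an illustrative sketch, the directory below is a placeholder for a local
+    Veco model directory that contains the tokenizer files):
+
+        tokenizer = VecoTokenizerFast.from_pretrained('/path/to/veco')
+        inputs = tokenizer('you are so handsome.', 'you are so beautiful.', return_tensors='pt')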
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ['input_ids', 'attention_mask'] + slow_tokenizer_class = VecoTokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + bos_token='', + eos_token='', + sep_token='', + cls_token='', + unk_token='', + pad_token='', + mask_token='', + **kwargs): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken( + mask_token, lstrip=True, rstrip=False) if isinstance( + mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An Veco sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
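+
+        Example (illustrative, with placeholder ids): for a single sequence,
+        `create_token_type_ids_from_sequences([5, 6])` returns `[0, 0, 0, 0]`,
+        one zero per position of cls + [5, 6] + sep.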
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' + 'tokenizer.') + + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory.') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 8174d054..f6896e4a 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -517,3 +517,10 @@ class MsDataset: def to_hf_dataset(self) -> Dataset: self._hf_ds.reset_format() return self._hf_ds + + @staticmethod + def interleave_datasets(datasets: List[Any], + probabilities: Optional[List[float]] = None, + seed: Optional[int] = None): + from datasets import interleave_datasets + return interleave_datasets(datasets, probabilities, seed) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 0937e441..a82f6ed5 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -9,6 +9,7 @@ class OutputKeys(object): SCORES = 'scores' LABEL = 'label' LABELS = 'labels' + INPUT_IDS = 'input_ids' LABEL_POS = 'label_pos' POSES = 'poses' CAPTION = 'caption' diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index e6a35efc..1111f0d3 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -9,9 +9,8 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .fill_mask_pipeline import FillMaskPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .nli_pipeline import NLIPipeline - from .sentence_similarity_pipeline import SentenceSimilarityPipeline - from .sentiment_classification_pipeline import SentimentClassificationPipeline + from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline + from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .text_generation_pipeline import TextGenerationPipeline from .translation_pipeline import TranslationPipeline @@ -28,10 +27,10 @@ else: 'dialog_modeling_pipeline': ['DialogModelingPipeline'], 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], - 'nli_pipeline': ['NLIPipeline'], - 'sentence_similarity_pipeline': ['SentenceSimilarityPipeline'], - 'sentiment_classification_pipeline': - ['SentimentClassificationPipeline'], + 'single_sentence_classification_pipeline': + ['SingleSentenceClassificationPipeline'], + 'pair_sentence_classification_pipeline': + ['PairSentenceClassificationPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], diff 
--git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 27c34817..e4affe40 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -5,11 +5,10 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp.masked_language import MaskedLanguageModelBase from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -21,18 +20,18 @@ _type_map = {'veco': 'roberta', 'sbert': 'bert'} class FillMaskPipeline(Pipeline): def __init__(self, - model: Union[MaskedLanguageModelBase, str], - preprocessor: Optional[FillMaskPreprocessor] = None, - first_sequence='sentense', + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', **kwargs): """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction Args: - model (MaskedLanguageModelBase): a model instance - preprocessor (FillMaskPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ fill_mask_model = model if isinstance( - model, MaskedLanguageModelBase) else Model.from_pretrained(model) + model, Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = FillMaskPreprocessor( @@ -73,7 +72,7 @@ class FillMaskPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the prediction results @@ -85,8 +84,8 @@ class FillMaskPipeline(Pipeline): Dict[str, str]: the prediction results """ import numpy as np - logits = inputs['logits'].detach().cpu().numpy() - input_ids = inputs['input_ids'].detach().cpu().numpy() + logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() + input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) model_type = self.model.config.model_type process_type = model_type if model_type in self.mask_id else _type_map[ diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 65334144..29c439fc 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -4,11 +4,10 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import TransformerCRFForNamedEntityRecognition from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NERPreprocessor +from modelscope.preprocessors import NERPreprocessor, Preprocessor from modelscope.utils.constant import Tasks __all__ = ['NamedEntityRecognitionPipeline'] @@ -20,13 +19,12 @@ __all__ = ['NamedEntityRecognitionPipeline'] class NamedEntityRecognitionPipeline(Pipeline): 
def __init__(self, - model: Union[TransformerCRFForNamedEntityRecognition, str], - preprocessor: Optional[NERPreprocessor] = None, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, **kwargs): model = model if isinstance(model, - TransformerCRFForNamedEntityRecognition - ) else Model.from_pretrained(model) + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = NERPreprocessor(model.model_dir) model.eval() diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py deleted file mode 100644 index 200f44e4..00000000 --- a/modelscope/pipelines/nlp/nli_pipeline.py +++ /dev/null @@ -1,73 +0,0 @@ -import uuid -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SbertForNLI -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLIPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['NLIPipeline'] - - -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -class NLIPipeline(Pipeline): - - def __init__(self, - model: Union[SbertForNLI, str], - preprocessor: NLIPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SbertForNLI): a model instance - preprocessor (NLIPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SbertForNLI), \ - 'model must be a single str or SbertForNLI' - model = model if isinstance( - model, SbertForNLI) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = NLIPreprocessor( - model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert len(model.id2label) > 0 - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.model.id2label[cid] for cid in cls_ids] - - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py new file mode 100644 index 00000000..0804ec8c --- /dev/null +++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py @@ -0,0 +1,37 @@ +from typing import Union + +from modelscope.models.base import Model +from ...metainfo import Pipelines +from ...preprocessors import (PairSentenceClassificationPreprocessor, + Preprocessor) +from ...utils.constant import Tasks +from ..builder import PIPELINES +from .sequence_classification_pipeline_base import \ + SequenceClassificationPipelineBase + 
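+# Illustrative usage sketch (comments only, nothing below is executed; the model id is a
+# placeholder for any sentence-similarity or NLI model whose directory provides the label
+# mapping read by the preprocessor):
+#
+#     from modelscope.pipelines import pipeline
+#     from modelscope.utils.constant import Tasks
+#
+#     pipe = pipeline(Tasks.sentence_similarity, model='<your-model-id>')
+#     print(pipe(('you are so handsome.', 'you are so beautiful.')))
+#     # -> {'scores': [...], 'labels': [...]}
+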
+__all__ = ['PairSentenceClassificationPipeline'] + + +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) +class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + first_sequence='first_sequence', + second_sequence='second_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp pair sentence classification pipeline for prediction + + Args: + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + if preprocessor is None: + preprocessor = PairSentenceClassificationPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + second_sequence=second_sequence) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py deleted file mode 100644 index c09e2115..00000000 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SbertForSentenceSimilarity -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SentenceSimilarityPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['SentenceSimilarityPipeline'] - - -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -class SentenceSimilarityPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: SentenceSimilarityPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction - - Args: - model (SbertForSentenceSimilarity): a model instance - preprocessor (SentenceSimilarityPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \ - 'model must be a single str or SbertForSentenceSimilarity' - sc_model = model if isinstance( - model, - SbertForSentenceSimilarity) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = SentenceSimilarityPreprocessor( - sc_model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - sc_model.eval() - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - - assert hasattr(self.model, 'id2label'), \ - 'id2label map should be initalizaed in init function.' 
- - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - top_indices = np.argpartition(probs, -num_classes)[-num_classes:] - cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)] - probs = probs[cls_ids].tolist() - cls_names = [self.model.id2label[cid] for cid in cls_ids] - b = 0 - return {OutputKeys.SCORES: probs[b], OutputKeys.LABELS: cls_names[b]} diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py deleted file mode 100644 index 8e57d77b..00000000 --- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py +++ /dev/null @@ -1,74 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SequenceClassificationModel -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SentimentClassificationPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['SentimentClassificationPipeline'] - - -@PIPELINES.register_module( - Tasks.sentiment_classification, - module_name=Pipelines.sentiment_classification) -class SentimentClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[SequenceClassificationModel, str], - preprocessor: SentimentClassificationPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SequenceClassificationModel): a model instance - preprocessor (SentimentClassificationPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SequenceClassificationModel), \ - 'model must be a single str or SentimentClassification' - model = model if isinstance( - model, - SequenceClassificationModel) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = SentimentClassificationPreprocessor( - model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert len(model.id2label) > 0 - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.model.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git 
a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py new file mode 100644 index 00000000..ad31bfbd --- /dev/null +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -0,0 +1,60 @@ +from typing import Any, Dict, Union + +import numpy as np +import torch + +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from ...preprocessors import Preprocessor +from ..base import Pipeline + + +class SequenceClassificationPipelineBase(Pipeline): + + def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (str or Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + assert preprocessor is not None + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + topk (int): The topk probs to take + Returns: + Dict[str, str]: the prediction results + """ + + probs = inputs[OutputKeys.PROBABILITIES][0] + num_classes = probs.shape[0] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + cls_ids = top_indices[np.argsort(probs[top_indices])] + probs = probs[cls_ids].tolist() + + cls_names = [self.id2label[cid] for cid in cls_ids] + return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py new file mode 100644 index 00000000..8e0b4fe0 --- /dev/null +++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py @@ -0,0 +1,35 @@ +from typing import Union + +from ...metainfo import Pipelines +from ...models import Model +from ...preprocessors import (Preprocessor, + SingleSentenceClassificationPreprocessor) +from ...utils.constant import Tasks +from ..builder import PIPELINES +from .sequence_classification_pipeline_base import \ + SequenceClassificationPipelineBase + +__all__ = ['SingleSentenceClassificationPipeline'] + + +@PIPELINES.register_module( + Tasks.sentiment_classification, + module_name=Pipelines.sentiment_classification) +class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + first_sequence='first_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp single sentence classification pipeline for prediction + + Args: + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + 
if preprocessor is None: + preprocessor = SingleSentenceClassificationPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 85a81eba..287c98ff 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines -from modelscope.models.base import TorchModel +from modelscope.models.base import Model from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TextGenerationPreprocessor @@ -17,7 +17,7 @@ __all__ = ['TextGenerationPipeline'] class TextGenerationPipeline(Pipeline): def __init__(self, - model: Union[TorchModel, str], + model: Union[Model, str], preprocessor: Optional[TextGenerationPreprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction @@ -26,8 +26,8 @@ class TextGenerationPipeline(Pipeline): model (PalmForTextGeneration): a model instance preprocessor (TextGenerationPreprocessor): a preprocessor instance """ - model = model if isinstance( - model, TorchModel) else TorchModel.from_pretrained(model) + model = model if isinstance(model, + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TextGenerationPreprocessor( model.model_dir, diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index fdf9be64..dba3fe9f 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -4,11 +4,9 @@ from typing import Any, Dict import numpy as np import tensorflow as tf -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines -from modelscope.models.nlp import CsanmtForTranslation from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 73d0c278..06e6a31c 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -4,11 +4,11 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForTokenClassification from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['WordSegmentationPipeline'] @@ -18,33 +18,35 @@ __all__ = ['WordSegmentationPipeline'] Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(Pipeline): - def __init__( - self, - model: Union[SbertForTokenClassification, str], - 
preprocessor: Optional[TokenClassificationPreprocessor] = None, - **kwargs): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction Args: - model (StructBertForTokenClassification): a model instance - preprocessor (TokenClassificationPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ - model = model if isinstance( - model, - SbertForTokenClassification) else Model.from_pretrained(model) + model = model if isinstance(model, + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TokenClassificationPreprocessor(model.model_dir) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.tokenizer = preprocessor.tokenizer - self.config = model.config - assert len(self.config.id2label) > 0 - self.id2label = self.config.id2label + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): - return super().forward(inputs, **forward_params) + return { + **self.model(inputs, **forward_params), OutputKeys.TEXT: text + } def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 642d4870..d0dd2336 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -5,11 +5,11 @@ from scipy.special import softmax from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForZeroShotClassification from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import ZeroShotClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + ZeroShotClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['ZeroShotClassificationPipeline'] @@ -21,19 +21,18 @@ __all__ = ['ZeroShotClassificationPipeline'] class ZeroShotClassificationPipeline(Pipeline): def __init__(self, - model: Union[SbertForZeroShotClassification, str], - preprocessor: ZeroShotClassificationPreprocessor = None, + model: Union[Model, str], + preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """use `model` and `preprocessor` to create a nlp zero-shot text classification pipeline for prediction Args: - model (SbertForZeroShotClassification): a model instance - preprocessor (SentimentClassificationPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ - assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \ - 'model must be a single str or SbertForZeroShotClassification' - model = model if isinstance( - model, - 
SbertForZeroShotClassification) else Model.from_pretrained(model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) self.entailment_id = 0 self.contradiction_id = 2 if preprocessor is None: @@ -58,7 +57,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -70,7 +69,7 @@ class ZeroShotClassificationPipeline(Pipeline): Returns: Dict[str, Any]: the prediction results """ - logits = inputs['logits'] + logits = inputs[OutputKeys.LOGITS] if multi_label or len(candidate_labels) == 1: logits = logits[..., [self.contradiction_id, self.entailment_id]] scores = softmax(logits, axis=-1)[..., 1] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9d991146..c73a6c4f 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -18,11 +18,11 @@ if TYPE_CHECKING: MPlugVisualQuestionAnsweringPreprocessor) from .nlp import (Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, - TokenClassificationPreprocessor, NLIPreprocessor, - SentimentClassificationPreprocessor, - SentenceSimilarityPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - TextErrorCorrectionPreprocessor) + TokenClassificationPreprocessor, + SingleSentenceClassificationPreprocessor, + PairSentenceClassificationPreprocessor, + FillMaskPreprocessor, ZeroShotClassificationPreprocessor, + NERPreprocessor, TextErrorCorrectionPreprocessor) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) @@ -46,8 +46,8 @@ else: 'nlp': [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'SentenceSimilarityPreprocessor', 'FillMaskPreprocessor', + 'SingleSentenceClassificationPreprocessor', + 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'TextErrorCorrectionPreprocessor' ], diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index d0142693..6360a907 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- +import os from abc import ABC, abstractmethod from typing import Any, Dict @@ -10,6 +10,8 @@ class Preprocessor(ABC): def __init__(self, *args, **kwargs): self._mode = ModeKeys.INFERENCE + self.device = int( + os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @abstractmethod diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index a0a7a5b5..f0951f38 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -2,14 +2,14 @@ import os.path as osp import uuid -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Iterable, Optional, Tuple, Union from transformers import AutoTokenizer -from modelscope.metainfo import Preprocessors -from modelscope.models import Model +from modelscope.metainfo import Models, Preprocessors +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Fields, InputFields, ModeKeys -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS @@ -17,8 +17,8 @@ from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'FillMaskPreprocessor', 'SentenceSimilarityPreprocessor', + 'PairSentenceClassificationPreprocessor', + 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'TextErrorCorrectionPreprocessor' ] @@ -38,99 +38,6 @@ class Tokenize(Preprocessor): return data -class NLPPreprocessorBase(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.tokenize_kwargs = kwargs - self.tokenizer = self.build_tokenizer(model_dir) - self.label2id = parse_label_mapping(self.model_dir) - - def build_tokenizer(self, model_dir): - from sofa import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir) - - @type_assert(object, object) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' 
- Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b = None, None - if isinstance(data, str): - text_a = data - elif isinstance(data, tuple): - assert len(data) == 2 - text_a, text_b = data - elif isinstance(data, dict): - text_a = data.get(self.first_sequence) - text_b = data.get(self.second_sequence, None) - - rst = self.tokenizer(text_a, text_b, **self.tokenize_kwargs) - if self._mode == ModeKeys.TRAIN: - rst = {k: v.squeeze() for k, v in rst.items()} - if self.label2id is not None and 'label' in data: - rst['label'] = self.label2id[str(data['label'])] - return rst - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -class NLIPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = False - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SentimentClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = 'max_length' - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -class SentenceSimilarityPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = False if 'padding' not in kwargs else kwargs[ - 'padding'] - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) class SequenceClassificationPreprocessor(Preprocessor): @@ -197,32 +104,193 @@ class SequenceClassificationPreprocessor(Preprocessor): return rst +class NLPTokenizerPreprocessorBase(Preprocessor): + + def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(**kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') + self.pair = pair + self._mode = mode + self.label = kwargs.pop('label', OutputKeys.LABEL) + self.label2id = None + if 'label2id' in kwargs: + self.label2id = kwargs.pop('label2id') + if self.label2id is None: + self.label2id = parse_label_mapping(self.model_dir) + + self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) + + @property + def id2label(self): + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def build_tokenizer(self, model_dir): + model_type = get_model_type(model_dir) + if model_type in (Models.structbert, Models.gpt3, Models.palm): + from modelscope.models.nlp.structbert import SbertTokenizerFast + return SbertTokenizerFast.from_pretrained(model_dir) + elif model_type == Models.veco: + from modelscope.models.nlp.veco import VecoTokenizerFast + return VecoTokenizerFast.from_pretrained(model_dir) + else: + return AutoTokenizer.from_pretrained(model_dir) + + def 
__call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + self.labels_to_id(labels, output) + return output + + def parse_text_and_label(self, data): + text_a, text_b, labels = None, None, None + if isinstance(data, str): + text_a = data + elif isinstance(data, tuple) or isinstance(data, list): + if len(data) == 3: + text_a, text_b, labels = data + elif len(data) == 2: + if self.pair: + text_a, text_b = data + else: + text_a, labels = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + text_b = data.get(self.second_sequence) + labels = data.get(self.label) + + return text_a, text_b, labels + + def labels_to_id(self, labels, output): + + def label_can_be_mapped(label): + return isinstance(label, str) or isinstance(label, int) + + if labels is not None: + if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ + and self.label2id is not None: + output[OutputKeys.LABEL] = [ + self.label2id[str(label)] for label in labels + ] + elif label_can_be_mapped(labels) and self.label2id is not None: + output[OutputKeys.LABEL] = self.label2id[str(labels)] + else: + output[OutputKeys.LABEL] = labels + + @PREPROCESSORS.register_module( - Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune') -class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor): - """Sentence similarity preprocessor in the finetune scenario + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == 'inference' else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=True, mode=mode, **kwargs) - Mainly added the label mapping procedure. 
- """ - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['padding'] = 'max_length' - super().__init__(model_dir, *args, **kwargs) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == 'inference' else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + self.sequence_length = kwargs.pop('sequence_length', 512) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict], hypothesis_template: str, + candidate_labels: list) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + features = self.tokenizer( + pairs, + padding=True, + truncation=True, + max_length=self.sequence_length, + truncation_strategy='only_first', + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) + return features @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPPreprocessorBase): +class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - def __init__(self, model_dir: str, tokenizer=None, *args, **kwargs): + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): self.tokenizer = self.build_tokenizer( model_dir) if tokenizer is None else tokenizer - kwargs['truncation'] = True - kwargs['padding'] = True - kwargs['return_tensors'] = 'pt' - kwargs['return_token_type_ids'] = False + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', True) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) @staticmethod def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: @@ -240,19 +308,13 @@ class TextGenerationPreprocessor(NLPPreprocessorBase): roberta_tokenizer_dir, do_lower_case=False) return super().build_tokenizer(model_dir) - -@PREPROCESSORS.register_module( - Fields.nlp, module_name='palm-text-gen-tokenizer-finetune') -class TextGenerationFinetunePreprocessor(TextGenerationPreprocessor): - - @type_assert(object, dict) - def __call__(self, data: dict) -> Dict[str, Any]: + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + if self._mode == 'inference': + return super().__call__(data) src_txt = data['src_txt'] tgt_txt = data['tgt_txt'] src_rst = super().__call__(src_txt) tgt_rst = super().__call__(tgt_txt) - src_rst = {k: v.squeeze() 
for k, v in src_rst.items()} - tgt_rst = {k: v.squeeze() for k, v in tgt_rst.items()} return { 'src': src_rst['input_ids'], @@ -261,87 +323,69 @@ class TextGenerationFinetunePreprocessor(TextGenerationPreprocessor): } -@PREPROCESSORS.register_module(Fields.nlp) -class FillMaskPreprocessor(NLPPreprocessorBase): +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = 'max_length' - kwargs['return_tensors'] = 'pt' + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = True - super().__init__(model_dir, *args, **kwargs) - - def build_tokenizer(self, model_dir): - from modelscope.utils.hub import get_model_type - model_type = get_model_type(model_dir) - if model_type in ['sbert', 'structbert', 'bert']: - from sofa import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir, use_fast=False) - elif model_type == 'veco': - from sofa import VecoTokenizer - return VecoTokenizer.from_pretrained(model_dir, use_fast=False) - else: - # TODO Only support veco & sbert - raise RuntimeError(f'Unsupported model type: {model_type}') + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -class TokenClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - - @type_assert(object, str) - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: - """process the raw input data + Fields.nlp, + module_name=Preprocessors.word_segment_text_to_label_preprocessor) +class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - if isinstance(data, dict): - data = data[self.first_sequence] - text = data.replace(' ', '').strip() - tokens = [] - for token in text: - token = self.tokenizer.tokenize(token) - tokens.extend(token) - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids) - attention_mask = [1] * len(input_ids) - token_type_ids = [0] * len(input_ids) + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.label = kwargs.pop('label', OutputKeys.LABELS) + + def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: + data = data.split(' ') + data = list(filter(lambda x: len(x) > 0, data)) + + def produce_train_sample(words): + chars = [] + labels = [] + for word in words: + chars.extend(list(word)) + if len(word) == 1: + labels.append('S-CWS') + else: + labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) + + ['E-CWS']) + assert len(chars) == len(labels) + return chars, labels + + chars, labels = produce_train_sample(data) return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids + self.first_sequence: chars, + self.label: labels, } @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, *args, **kwargs) + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['is_split_into_words'] = kwargs.pop( + 'is_split_into_words', + False if mode == ModeKeys.INFERENCE else True) + self.label_all_tokens = kwargs.pop('label_all_tokens', False) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) - @type_assert(object, str) - def __call__(self, data, hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: """process the raw input data Args: @@ -352,20 +396,74 @@ class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): Returns: Dict[str, Any]: the preprocessed data """ - if isinstance(data, dict): - data = data.get(self.first_sequence) - pairs = [[data, hypothesis_template.format(label)] - for label in candidate_labels] - - features = self.tokenizer( - pairs, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_tensors='pt', - truncation_strategy='only_first') - return features + # preprocess the data for the model input + # if isinstance(data, dict): + # data = data[self.first_sequence] + # text = data.replace(' ', '').strip() + # tokens = [] + # for token in text: + # token = self.tokenizer.tokenize(token) + # tokens.extend(token) + # input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + # input_ids = 
self.tokenizer.build_inputs_with_special_tokens(input_ids) + # attention_mask = [1] * len(input_ids) + # token_type_ids = [0] * len(input_ids) + + # new code to deal with labels + # tokenized_inputs = self.tokenizer(data, truncation=True, is_split_into_words=True) + + text_a = None + labels_list = None + if isinstance(data, str): + text_a = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + labels_list = data.get(self.label) + tokenized_inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + word_ids = tokenized_inputs.word_ids() + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + tokenized_inputs['labels'] = labels + # new code end + + if self._mode == ModeKeys.INFERENCE: + tokenized_inputs[OutputKeys.TEXT] = text_a + return tokenized_inputs @PREPROCESSORS.register_module( diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py index 80036ed1..038ab09b 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py @@ -24,7 +24,7 @@ class DialogStateTrackingPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - from sofa.models.space import SpaceConfig, SpaceTokenizer + from modelscope.models.nlp.space import SpaceConfig, SpaceTokenizer self.model_dir: str = model_dir self.config = SpaceConfig.from_pretrained(self.model_dir) self.tokenizer = SpaceTokenizer.from_pretrained(self.model_dir) diff --git a/modelscope/task_datasets/__init__.py b/modelscope/task_datasets/__init__.py index 5f0d9b1e..93e01cb5 100644 --- a/modelscope/task_datasets/__init__.py +++ b/modelscope/task_datasets/__init__.py @@ -7,12 +7,14 @@ if TYPE_CHECKING: from .base import TaskDataset from .builder import TASK_DATASETS, build_task_dataset from .torch_base_dataset import TorchTaskDataset + from .veco_dataset import VecoDataset else: _import_structure = { 'base': ['TaskDataset'], 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], + 'veco_dataset': ['VecoDataset'], } import sys diff --git a/modelscope/task_datasets/base.py b/modelscope/task_datasets/base.py index a4104ced..39b791b1 100644 --- a/modelscope/task_datasets/base.py +++ b/modelscope/task_datasets/base.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
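To make the new word-segmentation preprocessing concrete, here is a minimal sketch of how WordSegmentationBlankSetToLabelPreprocessor (added above) is expected to behave. The input sentence is a made-up example, the import path follows the finetune tests later in this change, and the 'labels' key assumes OutputKeys.LABELS == 'labels'; the produced labels are then mapped through label2id by TokenClassificationPreprocessor, which assigns -100 to special tokens and non-first sub-tokens as shown in the hunk above.

    from modelscope.preprocessors.nlp import WordSegmentationBlankSetToLabelPreprocessor

    preprocessor = WordSegmentationBlankSetToLabelPreprocessor()
    # A blank-separated sentence (made-up example): two 2-char words and two 1-char words.
    sample = preprocessor('今天 天气 不 错')
    # Expected output, following the B/I/E/S-CWS scheme implemented above:
    # {'first_sequence': ['今', '天', '天', '气', '不', '错'],
    #  'labels': ['B-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'S-CWS', 'S-CWS']}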
from abc import ABC, abstractmethod -from typing import Any, List, Tuple +from typing import Any, List, Tuple, Union class TaskDataset(ABC): @@ -8,7 +8,7 @@ class TaskDataset(ABC): """ def __init__(self, - datasets: Tuple[Any, List[Any]], + datasets: Union[Any, List[Any]], mode, preprocessor=None, **kwargs): @@ -18,7 +18,7 @@ class TaskDataset(ABC): self._inner_dataset = self.prepare_dataset(datasets) @abstractmethod - def prepare_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: """Prepare a dataset. User can process the input datasets in a whole dataset perspective. diff --git a/modelscope/task_datasets/torch_base_dataset.py b/modelscope/task_datasets/torch_base_dataset.py index 5ec9209e..014e4faa 100644 --- a/modelscope/task_datasets/torch_base_dataset.py +++ b/modelscope/task_datasets/torch_base_dataset.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, List, Tuple +from typing import Any, List, Tuple, Union from torch.utils.data import ConcatDataset, Dataset @@ -14,7 +14,7 @@ class TorchTaskDataset(TaskDataset, Dataset): """ def __init__(self, - datasets: Tuple[Any, List[Any]], + datasets: Union[Any, List[Any]], mode, preprocessor=None, **kwargs): @@ -26,7 +26,7 @@ class TorchTaskDataset(TaskDataset, Dataset): def __len__(self): return len(self._inner_dataset) - def prepare_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: """Prepare a dataset. User can process the input datasets in a whole dataset perspective. diff --git a/modelscope/task_datasets/veco_dataset.py b/modelscope/task_datasets/veco_dataset.py new file mode 100644 index 00000000..df7c6483 --- /dev/null +++ b/modelscope/task_datasets/veco_dataset.py @@ -0,0 +1,76 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, List, Union + +import numpy as np +from datasets import Dataset, IterableDataset, concatenate_datasets + +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + + +@TASK_DATASETS.register_module(module_name=Models.veco, group_key=Tasks.nli) +class VecoDataset(TorchTaskDataset): + + def __init__(self, + datasets: Union[Any, List[Any]], + mode, + preprocessor=None, + **kwargs): + self.seed = kwargs.get('seed', 42) + self.permutation = None + self.datasets = None + super().__init__(datasets, mode, preprocessor, **kwargs) + + def switch_dataset(self, idx): + """Switch dataset in evaluation. + + Veco evaluates dataset one by one. + + Args: + idx: The index of the dataset + """ + if self.mode == 'train': + raise ValueError( + 'Only support switch dataset in the evaluation loop') + if idx >= len(self.datasets): + raise ValueError( + 'Index is bigger than the number of the datasets.') + self._inner_dataset = self.datasets[idx] + + def __getitem__(self, item): + if self.permutation is not None: + item = self.permutation[item] + return super().__getitem__(item) + + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: + """Compose all the datasets. + + If the mode is 'train', all datasets will be mixed together, if the mode is 'eval', + the datasets will be kept and returns the first one. + + Args: + datasets: The datasets to be composed. + + Returns: The final dataset. 
+ """ + if not isinstance(datasets, (list, tuple)): + datasets = [datasets] + if self.mode == 'train': + if len(datasets) == 1: + return datasets[0] + elif all([ + isinstance(dataset, (Dataset, IterableDataset)) + for dataset in datasets + ]): + dataset = concatenate_datasets(list(datasets)) + return dataset.shuffle(seed=self.seed) + else: + generator = np.random.default_rng(self.seed) + _len = sum([len(dataset) for dataset in datasets]) + self.permutation = generator.permutation(_len) + return super().prepare_dataset(datasets) + else: + self.datasets = datasets + return self.datasets[0] diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 350bab61..d802fd8b 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -4,4 +4,5 @@ from .cv import (ImageInstanceSegmentationTrainer, ImagePortraitEnhancementTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer +from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer from .trainer import EpochBasedTrainer diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py index aea27f2f..80d8c03c 100644 --- a/modelscope/trainers/hooks/evaluation_hook.py +++ b/modelscope/trainers/hooks/evaluation_hook.py @@ -32,6 +32,7 @@ class EvaluationHook(Hook): def do_evaluate(self, trainer): """Evaluate the results.""" eval_res = trainer.evaluate() + trainer.data_loader = trainer.train_dataloader for name, val in eval_res.items(): trainer.log_buffer.output[name] = val diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index cf3a16e7..9a5de392 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -21,9 +21,6 @@ class LrSchedulerHook(Hook): def __init__(self, by_epoch=True, warmup=None) -> None: super().__init__() self.by_epoch = by_epoch - if not self.by_epoch: - raise ValueError('We only support ``by_epoch=True`` now!') - self.warmup = warmup self.warmup_lr_scheduler = None @@ -49,6 +46,11 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): + if not self.by_epoch: + if self.warmup_lr_scheduler is not None: + self.warmup_lr_scheduler.step() + else: + trainer.lr_scheduler.step() trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def before_train_epoch(self, trainer): diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py new file mode 100644 index 00000000..c8121db6 --- /dev/null +++ b/modelscope/trainers/nlp_trainer.py @@ -0,0 +1,192 @@ +import os +from typing import Callable, Dict, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metrics.builder import build_metric +from modelscope.models.base import Model, TorchModel +from modelscope.msdatasets import MsDataset +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, + ModelFile, Tasks) +from .base import TRAINERS +from .trainer import EpochBasedTrainer + + +@TRAINERS.register_module(module_name='NlpEpochBasedTrainer') +class NlpEpochBasedTrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + cfg_modify_fn: 
Optional[Callable] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Preprocessor] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + """Add code to adapt to NLP models. + + Args: + cfg_modify_fn: An input function used to modify the cfg read from the file. + """ + + if isinstance(model, str): + if os.path.exists(model): + model_dir = model if os.path.isdir(model) else os.path.dirname( + model) + else: + model_dir = snapshot_download(model, revision=model_revision) + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' + model_dir = os.path.dirname(cfg_file) + + self.cfg_modify_fn = cfg_modify_fn + self.cfg = self.rebuild_config(Config.from_file(cfg_file)) + try: + labels = self.cfg.dataset.train.labels + except AttributeError: + labels = None + + self.label2id = None + self.num_labels = None + if labels is not None and len(labels) > 0: + self.label2id = {label: idx for idx, label in enumerate(labels)} + self.id2label = {idx: label for idx, label in enumerate(labels)} + self.num_labels = len(labels) + + def build_dataset_keys(cfg): + if cfg is not None: + input_keys = { + 'first_sequence': getattr(cfg, 'first_sequence', None), + 'second_sequence': getattr(cfg, 'second_sequence', None), + 'label': getattr(cfg, 'label', None), + } + else: + input_keys = {} + + return {k: v for k, v in input_keys.items() if v is not None} + + self.train_keys = build_dataset_keys( + self.cfg.dataset.train if hasattr(self.cfg, 'dataset') + and hasattr(self.cfg.dataset, 'train') else None) + # TODO eval may have special keys, which are not supported yet, + # because there is only one preprocessor in the trainer, and it only supports one group of keys. + self.eval_keys = self.train_keys + + super().__init__( + model=model_dir, + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + preprocessor=preprocessor, + optimizers=optimizers, + model_revision=model_revision, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + **kwargs) + + def rebuild_config(self, cfg: Config): + if self.cfg_modify_fn is not None: + return self.cfg_modify_fn(cfg) + return cfg + + def build_model(self) -> Union[nn.Module, TorchModel]: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + override this method in a subclass. + + """ + model_args = {} if self.num_labels is None else { + 'num_labels': self.num_labels + } + model = Model.from_pretrained( + self.model_dir, cfg_dict=self.cfg, **model_args) + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + def build_preprocessor(self) -> Preprocessor: + """Build the preprocessor. + + Users can override this method to implement custom logic. + + Returns: The preprocessor instance.
+ + """ + model_args = {} if self.label2id is None else { + 'label2id': self.label2id + } + cfg = ConfigDict({ + **getattr(self.cfg, 'preprocessor'), + 'model_dir': + self.model_dir, + **model_args, + 'mode': + ModeKeys.TRAIN, + **self.train_keys, + }) + return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + +@TRAINERS.register_module(module_name='VecoTrainer') +class VecoTrainer(NlpEpochBasedTrainer): + + def evaluate(self, checkpoint_path=None): + """Veco evaluates the datasets one by one. + + """ + from modelscope.task_datasets import VecoDataset + self.model.eval() + self._mode = ModeKeys.EVAL + metric_values = {} + + if self.eval_dataset is None: + val_data = self.cfg.dataset.val + self.eval_dataset = self.build_dataset( + val_data, mode=ModeKeys.EVAL) + + idx = 0 + dataset_cnt = 1 + if isinstance(self.eval_dataset, VecoDataset): + self.eval_dataset.switch_dataset(idx) + dataset_cnt = len(self.eval_dataset.datasets) + + while True: + self.eval_dataloader = self._build_dataloader_with_dataset( + self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) + self.data_loader = self.eval_dataloader + + metric_classes = [ + build_metric(metric, default_args={'trainer': self}) + for metric in self.metrics + ] + self.evaluation_loop(self.eval_dataloader, checkpoint_path, + metric_classes) + + for m_idx, metric_cls in enumerate(metric_classes): + if f'eval_dataset[{idx}]' not in metric_values: + metric_values[f'eval_dataset[{idx}]'] = {} + metric_values[f'eval_dataset[{idx}]'][ + self.metrics[m_idx]] = metric_cls.evaluate() + + idx += 1 + if idx < dataset_cnt: + self.eval_dataset.switch_dataset(idx) + else: + break + + return metric_values diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index e83654a2..c5574f32 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -22,7 +22,8 @@ from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.preprocessors import build_preprocessor from modelscope.preprocessors.base import Preprocessor -from modelscope.task_datasets import TorchTaskDataset, build_task_dataset +from modelscope.task_datasets.builder import build_task_dataset +from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler @@ -30,12 +31,12 @@ from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, ModelFile, Tasks, TrainerStages) +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.tensor_utils import torch_default_data_collator from modelscope.utils.torch_utils import (broadcast, create_device, get_dist_info, init_dist) -from modelscope.utils.utils import if_func_receive_dict_inputs from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -87,6 +88,7 @@ class EpochBasedTrainer(BaseTrainer): None), model_revision: Optional[str] = DEFAULT_MODEL_REVISION, **kwargs): + if isinstance(model, str): if os.path.exists(model): self.model_dir = model if os.path.isdir( @@ -108,9 +110,9 @@ class EpochBasedTrainer(BaseTrainer): 
self.model = model super().__init__(cfg_file, arg_parse_fn) - # add default config self.cfg.merge_from_dict(self._get_default_config(), force=False) + self.cfg = self.rebuild_config(self.cfg) if 'work_dir' in kwargs: self.work_dir = kwargs['work_dir'] @@ -130,9 +132,9 @@ class EpochBasedTrainer(BaseTrainer): self.device = create_device(device_name == 'cpu') self.train_dataset = self.to_task_dataset( - train_dataset, mode='train', preprocessor=self.preprocessor) + train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode='eval', preprocessor=self.preprocessor) + eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) self.data_collator = data_collator if data_collator is not None else torch_default_data_collator self.metrics = self.get_metrics() @@ -168,6 +170,14 @@ class EpochBasedTrainer(BaseTrainer): if not is_parallel(self.model) and self._dist: self.model = self.to_parallel(self.model) + def rebuild_config(self, cfg: Config): + """A method used to rebuild the config, any subclass can override this method. + + Returns: The rebuilt config + + """ + return cfg + @property def mode(self): return self._mode @@ -203,7 +213,7 @@ class EpochBasedTrainer(BaseTrainer): return self._max_epochs * len(self.data_loader) def to_task_dataset(self, - datasets: Tuple[Dataset, List[Dataset]], + datasets: Union[Dataset, List[Dataset]], mode: str, preprocessor: Optional[Preprocessor] = None): """Build the task specific dataset processor for this trainer. @@ -229,17 +239,13 @@ class EpochBasedTrainer(BaseTrainer): cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) return build_task_dataset(cfg, self.cfg.task) - elif isinstance(datasets, - Dataset) or (isinstance(datasets, List) - and isinstance(datasets[0], Dataset)): + else: cfg = ConfigDict( - type=self.cfg.model.type, mode=mode, datasets=datasets) + type=self.cfg.model.type, + mode=mode, + datasets=datasets, + preprocessor=preprocessor) return build_task_dataset(cfg, self.cfg.task) - else: - raise ValueError( - f'invalid datasets type: {type(datasets)}, ' - f'expected `MsDataset`, `torch.utils.data.Dataset` or list of them.' - ) except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: return TorchTaskDataset( @@ -262,8 +268,11 @@ class EpochBasedTrainer(BaseTrainer): # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor # when they are different ones in training and evaluation cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), 'model_dir': - self.model_dir + **getattr(self.cfg, 'preprocessor'), + 'model_dir': + self.model_dir, + 'mode': + ModeKeys.TRAIN, }) return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) @@ -324,6 +333,8 @@ class EpochBasedTrainer(BaseTrainer): **self.cfg.evaluation.get('dataloader', {})) self.data_loader = self.eval_dataloader metric_classes = [build_metric(metric) for metric in self.metrics] + for m in metric_classes: + m.trainer = self metric_values = self.evaluation_loop(self.eval_dataloader, checkpoint_path, metric_classes) @@ -338,10 +349,9 @@ class EpochBasedTrainer(BaseTrainer): """ Instantiate a pytorch model and return. By default, we will create a model using config from configuration file. You can - subclass and override this method in a subclass. + override this method in a subclass. 
""" - # TODO temp implementation, waiting for @zhangzhicheng model = Model.from_pretrained(self.model_dir) if not isinstance(model, nn.Module) and hasattr(model, 'model'): return model.model @@ -412,9 +422,8 @@ class EpochBasedTrainer(BaseTrainer): self._mode = ModeKeys.TRAIN inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess - if isinstance( - inputs, - Mapping) and not if_func_receive_dict_inputs(model.forward): + if isinstance(inputs, + Mapping) and not func_receive_dict_inputs(model.forward): train_outputs = model.forward(**inputs) else: train_outputs = model.forward(inputs) @@ -495,7 +504,7 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_dataset is None: val_data = self.cfg.dataset.val self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.TRAIN) + val_data, mode=ModeKeys.EVAL) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -523,7 +532,8 @@ class EpochBasedTrainer(BaseTrainer): ) torch_dataset = dataset.to_torch_dataset( preprocessors=self.preprocessor, ) - return torch_dataset + dataset = self.to_task_dataset(torch_dataset, mode) + return dataset def create_optimizer_and_scheduler(self): """ Create optimizer and lr scheduler diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index c30d1d15..a90a58b6 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -10,9 +10,9 @@ import torch from torch import distributed as dist from tqdm import tqdm +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) -from modelscope.utils.utils import if_func_receive_dict_inputs def single_gpu_test(model, @@ -37,18 +37,19 @@ def single_gpu_test(model, if data_collate_fn is not None: data = data_collate_fn(data) with torch.no_grad(): - if isinstance(data, - Mapping) and not if_func_receive_dict_inputs( - model.forward): - - result = model(**data) + if isinstance(data, Mapping) and not func_receive_dict_inputs( + model.forward): + result = model.forward(**data) else: - result = model(data) + result = model.forward(data) if metric_classes is not None: for metric_cls in metric_classes: metric_cls.add(result, data) - batch_size = len(result) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) for _ in range(batch_size): pbar.update() @@ -101,16 +102,18 @@ def multi_gpu_test(model, data = data_collate_fn(data) data_list.append(data) with torch.no_grad(): - if isinstance(data, - Mapping) and not if_func_receive_dict_inputs( - model.forward): - result = model(**data) + if isinstance(data, Mapping) and not func_receive_dict_inputs( + model.forward): + result = model.forward(**data) else: - result = model(data) + result = model.forward(data) results.append(result) if rank == 0: - batch_size = len(result) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) batch_size_all = batch_size * world_size count += batch_size_all if count > len(dataset): diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index b7b32c81..b8ee1258 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -16,9 +16,9 @@ from modelscope.fileio.file import LocalStorage from modelscope.metainfo import (Heads, Metrics, Models, Pipelines, Preprocessors, TaskModels, Trainers) from modelscope.utils.constant import 
Fields, Tasks +from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger from modelscope.utils.registry import default_group -from modelscope.utils.utils import get_default_cache_dir logger = get_logger() storage = LocalStorage() diff --git a/modelscope/utils/utils.py b/modelscope/utils/file_utils.py similarity index 96% rename from modelscope/utils/utils.py rename to modelscope/utils/file_utils.py index c2c47092..a04d890f 100644 --- a/modelscope/utils/utils.py +++ b/modelscope/utils/file_utils.py @@ -5,7 +5,7 @@ import os # TODO: remove this api, unify to flattened args -def if_func_receive_dict_inputs(func): +def func_receive_dict_inputs(func): """to decide if a func could recieve dict inputs or not Args: diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 5af67944..6e5326f4 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -98,4 +98,14 @@ def parse_label_mapping(model_dir): label_mapping = json.load(f) label2id = {name: idx for name, idx in label_mapping.items()} + if label2id is None: + config_path = os.path.join(model_dir, ModelFile.CONFIGURATION) + config = Config.from_file(config_path) + if hasattr(config, 'model') and hasattr(config.model, 'label2id'): + label2id = config.model.label2id + if label2id is None: + config_path = os.path.join(model_dir, 'config.json') + config = Config.from_file(config_path) + if hasattr(config, 'label2id'): + label2id = config.label2id return label2id diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 01b68f78..aca103d2 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -68,7 +68,7 @@ def torch_default_data_collator(features): ) and v is not None and not isinstance(v, str): if isinstance(v, torch.Tensor): batch[k] = torch.stack([f[k] for f in features]) - elif isinstance(v, list): + elif isinstance(v, list) and isinstance(v[0], torch.Tensor): batch[k] = torch.stack([d for f in features for d in f[k]]) else: batch[k] = torch.tensor(np.array([f[k] for f in features])) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index deb6a5bd..c69174fe 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -4,5 +4,5 @@ pai-easynlp # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 -sofa>=1.0.5 +seqeval spacy>=2.3.5 diff --git a/requirements/runtime.txt b/requirements/runtime.txt index fbf33854..5675f031 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -15,5 +15,5 @@ setuptools tensorboard tokenizers tqdm>=4.64.0 -transformers>=4.10.3 +transformers>=4.12.0 yapf diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/metrics/test_token_classification_metrics.py b/tests/metrics/test_token_classification_metrics.py new file mode 100644 index 00000000..b249b227 --- /dev/null +++ b/tests/metrics/test_token_classification_metrics.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +import numpy as np + +from modelscope.metrics.token_classification_metric import \ + TokenClassificationMetric +from modelscope.utils.test_utils import test_level + + +class TestTokenClsMetrics(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_value(self): + metric = TokenClassificationMetric() + + class Trainer: + pass + + metric.trainer = Trainer() + metric.trainer.label2id = { + 'B-obj': 0, + 'I-obj': 1, + 'O': 2, + } + + outputs = { + 'logits': + np.array([[[2.0, 1.0, 0.5], [1.0, 1.5, 1.0], [2.0, 1.0, 3.0], + [2.4, 1.5, 4.0], [2.0, 1.0, 3.0], [2.4, 1.5, 1.7], + [2.0, 1.0, 0.5], [2.4, 1.5, 0.5]]]) + } + inputs = {'labels': np.array([[0, 1, 2, 2, 0, 1, 2, 2]])} + metric.add(outputs, inputs) + ret = metric.evaluate() + self.assertTrue(np.isclose(ret['precision'], 0.25)) + self.assertTrue(np.isclose(ret['recall'], 0.5)) + self.assertTrue(np.isclose(ret['accuracy'], 0.5)) + print(ret) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/models/test_base_torch.py b/tests/models/test_base_torch.py index dcdf79be..c147259b 100644 --- a/tests/models/test_base_torch.py +++ b/tests/models/test_base_torch.py @@ -21,8 +21,8 @@ class TorchBaseTest(unittest.TestCase): self.conv1 = nn.Conv2d(1, 20, 5) self.conv2 = nn.Conv2d(20, 20, 5) - def forward(self, x): - x = F.relu(self.conv1(x)) + def forward(self, input): + x = F.relu(self.conv1(input)) return F.relu(self.conv2(x)) model = MyTorchModel() @@ -41,8 +41,8 @@ class TorchBaseTest(unittest.TestCase): self.conv1 = nn.Conv2d(1, 20, 5) self.conv2 = nn.Conv2d(20, 20, 5) - def forward(self, x): - x = F.relu(self.conv1(x)) + def forward(self, input): + x = F.relu(self.conv1(input)) return F.relu(self.conv2(x)) def postprocess(self, x): diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py index 449b0cb7..a5c29f16 100644 --- a/tests/pipelines/test_csanmt_translation.py +++ b/tests/pipelines/test_csanmt_translation.py @@ -12,7 +12,7 @@ class TranslationTest(unittest.TestCase): model_id = 'damo/nlp_csanmt_translation' inputs = 'Gut@@ ach : Incre@@ ased safety for pedestri@@ ans' - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline(task=Tasks.translation, model=self.model_id) print(pipeline_ins(input=self.inputs)) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index b028cfbe..2f57b2d8 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -45,7 +45,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = StructBertForMaskedLM(model_dir) + model = StructBertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) @@ -60,7 +60,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_veco) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = VecoForMaskedLM(model_dir) + model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) 
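The pipeline test updates in this change all follow the same migration: NLP models are now built via from_pretrained instead of the old constructor-with-model_dir style. Below is a minimal fill-mask sketch of that pattern; the import paths mirror the surrounding tests and the model id is a placeholder for illustration, not one taken from this diff.

    from modelscope.hub.snapshot_download import snapshot_download
    from modelscope.models.nlp import StructBertForMaskedLM  # import path assumed from the surrounding tests
    from modelscope.pipelines import pipeline
    from modelscope.preprocessors import FillMaskPreprocessor
    from modelscope.utils.constant import Tasks

    model_id = 'damo/nlp_structbert_fill-mask_chinese-large'  # placeholder id, for illustration only
    model_dir = snapshot_download(model_id)
    preprocessor = FillMaskPreprocessor(
        model_dir, first_sequence='sentence', second_sequence=None)
    model = StructBertForMaskedLM.from_pretrained(model_dir)  # was: StructBertForMaskedLM(model_dir)
    fill_mask = pipeline(Tasks.fill_mask, model=model, preprocessor=preprocessor)

As the other updated tests show, pipeline() also accepts a model id directly; the explicit from_pretrained call above just mirrors what these tests exercise.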
@@ -77,7 +77,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_bert) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = BertForMaskedLM(model_dir) + model = BertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 8d5d3dfa..f477fb37 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -3,10 +3,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForNLI +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import NLIPipeline -from modelscope.preprocessors import NLIPreprocessor +from modelscope.pipelines.nlp import PairSentenceClassificationPipeline +from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,9 +19,10 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = NLIPreprocessor(cache_path) - model = SbertForNLI(cache_path, tokenizer=tokenizer) - pipeline1 = NLIPipeline(model, preprocessor=tokenizer) + tokenizer = PairSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) + pipeline1 = PairSentenceClassificationPipeline( + model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') @@ -33,7 +34,7 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = NLIPreprocessor(model.model_dir) + tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 8cfb2c20..7a30d779 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -4,10 +4,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForSentenceSimilarity +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SentenceSimilarityPipeline -from modelscope.preprocessors import SentenceSimilarityPreprocessor +from modelscope.pipelines.nlp import PairSentenceClassificationPipeline +from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -20,9 +20,10 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test 
level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentenceSimilarityPreprocessor(cache_path) - model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer) - pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer) + tokenizer = PairSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) + pipeline1 = PairSentenceClassificationPipeline( + model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print('test1') @@ -36,7 +37,7 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentenceSimilarityPreprocessor(model.model_dir) + tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 53031e9d..82c068be 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -3,11 +3,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import (SbertForSentimentClassification, - SequenceClassificationModel) +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SentimentClassificationPipeline -from modelscope.preprocessors import SentimentClassificationPreprocessor +from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline +from modelscope.preprocessors import SingleSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,46 +18,52 @@ class SentimentClassificationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentimentClassificationPreprocessor(cache_path) - model = SequenceClassificationModel.from_pretrained( + tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained( self.model_id, num_labels=2) - pipeline1 = SentimentClassificationPipeline( + pipeline1 = SingleSentenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentiment_classification, model=model, - preprocessor=tokenizer, - model_revision='beta') + preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') print() print(f'sentence1: {self.sentence1}\n' f'pipeline1: {pipeline2(input=self.sentence1)}') + self.assertTrue( + isinstance(pipeline1.model, SbertForSequenceClassification)) + self.assertTrue( + isinstance(pipeline2.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentimentClassificationPreprocessor(model.model_dir) + tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentiment_classification, model=model, - preprocessor=tokenizer, - 
model_revision='beta') + preprocessor=tokenizer) print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.sentiment_classification, - model=self.model_id, - model_revision='beta') + task=Tasks.sentiment_classification, model=self.model_id) print(pipeline_ins(input=self.sentence1)) + print(pipeline_ins.model.__class__) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline( - task=Tasks.sentiment_classification, model_revision='beta') + pipeline_ins = pipeline(task=Tasks.sentiment_classification) print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) if __name__ == '__main__': diff --git a/tests/pipelines/test_sentiment_classification_task_model.py b/tests/pipelines/test_sentiment_classification_task_model.py new file mode 100644 index 00000000..2808ec84 --- /dev/null +++ b/tests/pipelines/test_sentiment_classification_task_model.py @@ -0,0 +1,70 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp.task_models.sequence_classification import \ + SequenceClassificationModel +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline +from modelscope.preprocessors import SingleSentenceClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SentimentClassificationTaskModelTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' + sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' + + @unittest.skip + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + model = SequenceClassificationModel.from_pretrained( + self.model_id, num_labels=2) + pipeline1 = SingleSentenceClassificationPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer, + model_revision='beta') + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline1(input=self.sentence1)}') + print() + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {pipeline2(input=self.sentence1)}') + + @unittest.skip + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id, revision='beta') + tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + @unittest.skip + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=self.model_id, + model_revision='beta') + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + @unittest.skip + def test_run_with_default_model(self): + 
pipeline_ins = pipeline( + task=Tasks.sentiment_classification, model_revision='beta') + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index fd397de3..c391e0a1 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -39,7 +39,7 @@ class TextGenerationTest(unittest.TestCase): for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh), (self.palm_model_id_en, self.palm_input_en)): cache_path = snapshot_download(model_id) - model = PalmForTextGeneration(cache_path) + model = PalmForTextGeneration.from_pretrained(cache_path) preprocessor = TextGenerationPreprocessor( cache_path, model.tokenizer, diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index 5e3571f7..98fab808 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -20,7 +20,7 @@ class WordSegmentationTest(unittest.TestCase): def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) tokenizer = TokenClassificationPreprocessor(cache_path) - model = SbertForTokenClassification(cache_path, tokenizer=tokenizer) + model = SbertForTokenClassification.from_pretrained(cache_path) pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.word_segmentation, model=model, preprocessor=tokenizer) diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py index df0098f0..ee0b5bae 100644 --- a/tests/pipelines/test_zero_shot_classification.py +++ b/tests/pipelines/test_zero_shot_classification.py @@ -3,7 +3,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForZeroShotClassification +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import ZeroShotClassificationPipeline from modelscope.preprocessors import ZeroShotClassificationPreprocessor @@ -21,7 +21,7 @@ class ZeroShotClassificationTest(unittest.TestCase): def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) tokenizer = ZeroShotClassificationPreprocessor(cache_path) - model = SbertForZeroShotClassification(cache_path, tokenizer=tokenizer) + model = SbertForSequenceClassification.from_pretrained(cache_path) pipeline1 = ZeroShotClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py new file mode 100644 index 00000000..fc59750d --- /dev/null +++ b/tests/taskdataset/test_veco_dataset.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +from modelscope.task_datasets.veco_dataset import VecoDataset +from modelscope.utils.test_utils import test_level + + +class TestVecoDataset(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_veco_dataset_train(self): + from datasets import Dataset + d0 = Dataset.from_dict({'a': [0, 1, 2]}) + d1 = Dataset.from_dict({'a': [10, 11, 12, 13, 14]}) + d2 = Dataset.from_dict({'a': [21, 22, 23, 24, 25, 26, 27]}) + dataset = VecoDataset([d0, d1, d2], mode='train') + self.assertEqual(len(dataset), 15) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_veco_dataset_eval(self): + from datasets import Dataset + d0 = Dataset.from_dict({'a': [0, 1, 2]}) + d1 = Dataset.from_dict({'a': [10, 11, 12, 13, 14]}) + d2 = Dataset.from_dict({'a': [21, 22, 23, 24, 25, 26, 27]}) + dataset = VecoDataset([d0, d1, d2], mode='eval') + self.assertEqual(len(dataset), 3) + dataset.switch_dataset(1) + self.assertEqual(len(dataset), 5) + dataset.switch_dataset(2) + self.assertEqual(len(dataset), 7) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index afb887a4..7e057ff0 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -270,6 +270,7 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.train_dataloader = train_dataloader trainer.data_loader = train_dataloader trainer.register_optimizers_hook() trainer.register_hook_from_cfg(trainer.cfg.train.hooks) diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py new file mode 100644 index 00000000..8e147f92 --- /dev/null +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -0,0 +1,244 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.trainers import build_trainer + + +class TestFinetuneSequenceClassification(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def finetune(self, + model_id, + train_dataset, + eval_dataset, + name='NlpEpochBasedTrainer', + cfg_modify_fn=None, + **kwargs): + kwargs = dict( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + **kwargs) + + os.environ['LOCAL_RANK'] = '0' + trainer = build_trainer(name=name, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(10): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skip + def test_finetune_afqmc(self): + + def cfg_modify_fn(cfg): + cfg.task = 'sentence-similarity' + cfg['preprocessor'] = {'type': 'sen-sim-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': ['0', '1'], + 'first_sequence': 'sentence1', + 'second_sequence': 'sentence2', + 'label': 'label', + } + } + cfg.train.max_epochs = 10 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + from datasets import load_dataset + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + dataset = load_dataset('clue', 'afqmc', download_config=dc) + self.finetune( + model_id='damo/nlp_structbert_backbone_tiny_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def test_finetune_tnews(self): + + def cfg_modify_fn(cfg): + # TODO no proper task for tnews + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14' + ], + 'first_sequence': + 'sentence', + 'label': + 'label', + } + } + cfg.train.max_epochs = 5 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + from datasets import load_dataset + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + dataset = load_dataset('clue', 'tnews', download_config=dc) + + self.finetune( + model_id='damo/nlp_structbert_backbone_tiny_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def 
test_veco_xnli(self): + from datasets import load_dataset + langs = ['en'] + langs_eval = ['en'] + train_datasets = [] + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + for lang in langs: + train_datasets.append( + load_dataset('xnli', lang, split='train', download_config=dc)) + eval_datasets = [] + for lang in langs_eval: + eval_datasets.append( + load_dataset( + 'xnli', lang, split='validation', download_config=dc)) + train_len = sum([len(dataset) for dataset in train_datasets]) + labels = ['0', '1', '2'] + + def cfg_modify_fn(cfg): + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg['dataset'] = { + 'train': { + 'first_sequence': 'premise', + 'second_sequence': 'hypothesis', + 'labels': labels, + 'label': 'label', + } + } + cfg['train'] = { + 'work_dir': + '/tmp', + 'max_epochs': + 2, + 'dataloader': { + 'batch_size_per_gpu': 16, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'AdamW', + 'lr': 2e-5, + 'options': { + 'cumulative_iters': 8, + } + }, + 'lr_scheduler': { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': int(train_len / 16) * 2, + 'options': { + 'by_epoch': False + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1, + 'save_dir': '/root' + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 500 + }] + } + cfg['evaluation'] = { + 'dataloader': { + 'batch_size_per_gpu': 128, + 'workers_per_gpu': 1, + 'shuffle': False + } + } + return cfg + + self.finetune( + 'damo/nlp_veco_fill-mask-large', + train_datasets, + eval_datasets, + name='VecoTrainer', + cfg_modify_fn=cfg_modify_fn) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py new file mode 100644 index 00000000..7449bc69 --- /dev/null +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -0,0 +1,200 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os
+import shutil
+import tempfile
+import unittest
+from functools import reduce
+
+from modelscope.trainers import build_trainer
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetuneTokenClassification(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name='NlpEpochBasedTrainer',
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(10):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skip
+    def test_token_classification(self):
+        # WS task
+        os.system(
+            f'curl http://dingkun.oss-cn-hangzhou-zmf.aliyuncs.com/atemp/train.txt > {self.tmp_dir}/train.txt'
+        )
+        os.system(
+            f'curl http://dingkun.oss-cn-hangzhou-zmf.aliyuncs.com/atemp/dev.txt > {self.tmp_dir}/dev.txt'
+        )
+        from datasets import load_dataset
+        dataset = load_dataset(
+            'text',
+            data_files={
+                'train': f'{self.tmp_dir}/train.txt',
+                'test': f'{self.tmp_dir}/dev.txt'
+            })
+
+        def split_to_dict(examples):
+            text, label = examples['text'].split('\t')
+            return {
+                'first_sequence': text.split(' '),
+                'labels': label.split(' ')
+            }
+
+        dataset = dataset.map(split_to_dict, batched=False)
+
+        def reducer(x, y):
+            x = x.split(' ') if isinstance(x, str) else x
+            y = y.split(' ') if isinstance(y, str) else y
+            return x + y
+
+        label_enumerate_values = list(
+            set(reduce(reducer, dataset['train'][:1000]['labels'])))
+        label_enumerate_values.sort()
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'token-classification'
+            cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
+            cfg['dataset'] = {
+                'train': {
+                    'labels': label_enumerate_values,
+                    'first_sequence': 'first_sequence',
+                    'label': 'labels',
+                }
+            }
+            cfg.train.max_epochs = 3
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 300
+            }]
+            return cfg
+
+        self.finetune(
+            'damo/nlp_structbert_backbone_tiny_std',
+            dataset['train'],
+            dataset['test'],
+            cfg_modify_fn=cfg_modify_fn)
+
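+    # The test below downloads the SIGHAN bakeoff 2005 PKU corpus, converts its
+    # whitespace-segmented lines into label sequences with
+    # WordSegmentationBlankSetToLabelPreprocessor, and holds out the last 30%
+    # of the training split for evaluation.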
+    @unittest.skip
+    def test_word_segmentation(self):
+        os.system(
+            f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip'
+        )
+        shutil.unpack_archive(f'{self.tmp_dir}/icwb2-data.zip', self.tmp_dir)
+        from datasets import load_dataset
+        from modelscope.preprocessors.nlp import WordSegmentationBlankSetToLabelPreprocessor
+        preprocessor = WordSegmentationBlankSetToLabelPreprocessor()
+        dataset = load_dataset(
+            'text',
+            data_files=f'{self.tmp_dir}/icwb2-data/training/pku_training.utf8')
+
+        def split_to_dict(examples):
+            return preprocessor(examples['text'])
+
+        dataset = dataset.map(split_to_dict, batched=False)
+
+        def reducer(x, y):
+            x = x.split(' ') if isinstance(x, str) else x
+            y = y.split(' ') if isinstance(y, str) else y
+            return x + y
+
+        label_enumerate_values = list(
+            set(reduce(reducer, dataset['train'][:1000]['labels'])))
+        label_enumerate_values.sort()
+
+        train_len = int(len(dataset['train']) * 0.7)
+        train_dataset = dataset['train'].select(range(train_len))
+        dev_dataset = dataset['train'].select(
+            range(train_len, len(dataset['train'])))
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'token-classification'
+            cfg['dataset'] = {
+                'train': {
+                    'labels': label_enumerate_values,
+                    'first_sequence': 'first_sequence',
+                    'label': 'labels',
+                }
+            }
+            cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
+            cfg.train.max_epochs = 3
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(train_dataset) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 50
+            }]
+            return cfg
+
+        self.finetune(
+            'damo/nlp_structbert_backbone_tiny_std',
+            train_dataset,
+            dev_dataset,
+            cfg_modify_fn=cfg_modify_fn)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_text_generation_trainer.py b/tests/trainers/test_text_generation_trainer.py
index 7c24bc0a..9c79f2f5 100644
--- a/tests/trainers/test_text_generation_trainer.py
+++ b/tests/trainers/test_text_generation_trainer.py
@@ -5,8 +5,7 @@ import tempfile
 import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.models.nlp.palm_for_text_generation import \
-    PalmForTextGeneration
+from modelscope.models.nlp.palm_v2 import PalmForTextGeneration
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
@@ -50,13 +49,21 @@ class TestTextGenerationTrainer(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer(self):
+
+        def cfg_modify_fn(cfg):
+            cfg.preprocessor.type = 'text-gen-tokenizer'
+            return cfg
+
         kwargs = dict(
             model=self.model_id,
             train_dataset=self.dataset,
             eval_dataset=self.dataset,
-            work_dir=self.tmp_dir)
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            model_revision='beta')
 
-        trainer = build_trainer(default_args=kwargs)
+        trainer = build_trainer(
+            name='NlpEpochBasedTrainer', default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -69,7 +76,7 @@ class TestTextGenerationTrainer(unittest.TestCase):
         if not os.path.exists(tmp_dir):
             os.makedirs(tmp_dir)
 
-        cache_path = snapshot_download(self.model_id)
+        cache_path = snapshot_download(self.model_id, revision='beta')
         model = PalmForTextGeneration.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
@@ -86,6 +93,44 @@ class TestTextGenerationTrainer(unittest.TestCase):
         for i in range(2):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
+    @unittest.skip
+    def test_finetune_cnndm(self):
+        from datasets import load_dataset
+        dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0')
+        train_dataset = dataset_dict['train'] \
+            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
+            .remove_columns('id')
+        eval_dataset = dataset_dict['validation'] \
+            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
+            .remove_columns('id')
+        num_warmup_steps = 2000
+
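+        # Noam-style schedule: the lambda below warms the learning rate up for
+        # `num_warmup_steps` optimizer steps and then decays it with the
+        # inverse square root of the step count, applied per step
+        # (by_epoch=False).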
+        def noam_lambda(current_step: int):
+            current_step += 1
+            return min(current_step**(-0.5),
+                       current_step * num_warmup_steps**(-1.5))
+
+        def cfg_modify_fn(cfg):
+            cfg.train.lr_scheduler = {
+                'type': 'LambdaLR',
+                'lr_lambda': noam_lambda,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            return cfg
+
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            model_revision='beta')
+        trainer = build_trainer(
+            name='NlpEpochBasedTrainer', default_args=kwargs)
+        trainer.train()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 603d6e5b..a2d899ba 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -6,8 +6,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
-from modelscope.models.nlp.sbert_for_sequence_classification import \
-    SbertTextClassfier
+from modelscope.models.nlp.sequence_classification import \
+    SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
@@ -102,7 +102,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
 
         cache_path = snapshot_download(model_id)
-        model = SbertTextClassfier.from_pretrained(cache_path)
+        model = SbertForSequenceClassification.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
             model=model,