diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index e76f2f14..af94b211 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -36,6 +36,7 @@ do -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ -e TEST_LEVEL=$TEST_LEVEL \ -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ + -e MODEL_TAG_URL=$MODEL_TAG_URL \ --workdir=$CODE_DIR_IN_CONTAINER \ --net host \ ${IMAGE_NAME}:${IMAGE_VERSION} \ diff --git a/data/test/images/mog_face_detection.jpg b/data/test/images/mog_face_detection.jpg new file mode 100644 index 00000000..c95881fe --- /dev/null +++ b/data/test/images/mog_face_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 +size 87228 diff --git a/data/test/images/mtcnn_face_detection.jpg b/data/test/images/mtcnn_face_detection.jpg new file mode 100644 index 00000000..c95881fe --- /dev/null +++ b/data/test/images/mtcnn_face_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 +size 87228 diff --git a/data/test/images/multimodal_similarity.jpg b/data/test/images/multimodal_similarity.jpg new file mode 100644 index 00000000..70a2b844 --- /dev/null +++ b/data/test/images/multimodal_similarity.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f24abbba43782d733dedbb0b4f416635af50263862e5632963ac9263e430555 +size 88542 diff --git a/data/test/images/ulfd_face_detection.jpg b/data/test/images/ulfd_face_detection.jpg new file mode 100644 index 00000000..c95881fe --- /dev/null +++ b/data/test/images/ulfd_face_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 +size 87228 diff --git a/data/test/videos/mask_dir/mask_00000_00320.png b/data/test/videos/mask_dir/mask_00000_00320.png new file mode 100644 index 00000000..2eae71a1 --- /dev/null +++ b/data/test/videos/mask_dir/mask_00000_00320.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b158f6029d9763d7f84042f7c5835f398c688fdbb6b3f4fe6431101d4118c66c +size 2766 diff --git a/data/test/videos/mask_dir/mask_00321_00633.png b/data/test/videos/mask_dir/mask_00321_00633.png new file mode 100644 index 00000000..89633eb6 --- /dev/null +++ b/data/test/videos/mask_dir/mask_00321_00633.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcf46b93077e2229ab69cd6ddb80e2689546c575ee538bb2033fee1124ef3e3 +size 2761 diff --git a/data/test/videos/video_inpainting_test.mp4 b/data/test/videos/video_inpainting_test.mp4 new file mode 100644 index 00000000..61f96fac --- /dev/null +++ b/data/test/videos/video_inpainting_test.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9870df5a86acaaec67063183dace795479cd0f05296f13058995f475149c56 +size 2957783 diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 78da0b6f..e0bfa908 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -75,7 +75,9 @@ RUN pip install --no-cache-dir --upgrade pip && \ ENV SHELL=/bin/bash # install special package -RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 datasets==2.1.0 +RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 ipykernel && \ + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ + pip config set install.trusted-host 
pypi.tuna.tsinghua.edu.cn RUN if [ "$USE_GPU" = "True" ] ; then \ pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py new file mode 100644 index 00000000..a597114f --- /dev/null +++ b/modelscope/exporters/__init__.py @@ -0,0 +1,4 @@ +from .base import Exporter +from .builder import build_exporter +from .nlp import SbertForSequenceClassificationExporter +from .torch_model_exporter import TorchModelExporter diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py new file mode 100644 index 00000000..f19d2bbb --- /dev/null +++ b/modelscope/exporters/base.py @@ -0,0 +1,53 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from abc import ABC, abstractmethod + +from modelscope.models import Model +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import ModelFile +from .builder import build_exporter + + +class Exporter(ABC): + """Exporter base class to output model to onnx, torch_script, graphdef, etc. + """ + + def __init__(self): + self.model = None + + @classmethod + def from_model(cls, model: Model, **kwargs): + """Build the Exporter instance. + + @param model: A model instance. it will be used to output the generated file, + and the configuration.json in its model_dir field will be used to create the exporter instance. + @param kwargs: Extra kwargs used to create the Exporter instance. + @return: The Exporter instance + """ + cfg = Config.from_file( + os.path.join(model.model_dir, ModelFile.CONFIGURATION)) + task_name = cfg.task + model_cfg = cfg.model + if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): + model_cfg.type = model_cfg.model_type + export_cfg = ConfigDict({'type': model_cfg.type}) + if hasattr(cfg, 'export'): + export_cfg.update(cfg.export) + exporter = build_exporter(export_cfg, task_name, kwargs) + exporter.model = model + return exporter + + @abstractmethod + def export_onnx(self, outputs: str, opset=11, **kwargs): + """Export the model as onnx format files. + + In some cases, several files may be generated, + So please return a dict which contains the generated name with the file path. + + @param opset: The version of the ONNX operator set to use. + @param outputs: The output dir. + @param kwargs: In this default implementation, + kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). + @return: A dict contains the model name with the model file path. + """ + pass diff --git a/modelscope/exporters/builder.py b/modelscope/exporters/builder.py new file mode 100644 index 00000000..90699c12 --- /dev/null +++ b/modelscope/exporters/builder.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.utils.config import ConfigDict +from modelscope.utils.registry import Registry, build_from_cfg + +EXPORTERS = Registry('exporters') + + +def build_exporter(cfg: ConfigDict, + task_name: str = None, + default_args: dict = None): + """ build exporter by the given model config dict + + Args: + cfg (:obj:`ConfigDict`): config dict for exporter object. + task_name (str, optional): task name, refer to + :obj:`Tasks` for more details + default_args (dict, optional): Default initialization arguments. 
+ """ + return build_from_cfg( + cfg, EXPORTERS, group_key=task_name, default_args=default_args) diff --git a/modelscope/exporters/nlp/__init__.py b/modelscope/exporters/nlp/__init__.py new file mode 100644 index 00000000..fdfd2711 --- /dev/null +++ b/modelscope/exporters/nlp/__init__.py @@ -0,0 +1,2 @@ +from .sbert_for_sequence_classification_exporter import \ + SbertForSequenceClassificationExporter diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py new file mode 100644 index 00000000..dc1e2b92 --- /dev/null +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -0,0 +1,81 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, Mapping, Tuple + +from torch.utils.data.dataloader import default_collate + +from modelscope.exporters.builder import EXPORTERS +from modelscope.exporters.torch_model_exporter import TorchModelExporter +from modelscope.metainfo import Models +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import ModeKeys, Tasks + + +@EXPORTERS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@EXPORTERS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@EXPORTERS.register_module(Tasks.nli, module_name=Models.structbert) +@EXPORTERS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassificationExporter(TorchModelExporter): + + def generate_dummy_inputs(self, + shape: Tuple = None, + **kwargs) -> Dict[str, Any]: + """Generate dummy inputs for model exportation to onnx or other formats by tracing. + + @param shape: A tuple of input shape which should have at most two dimensions. + shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. + shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. + @return: Dummy inputs. 
+ """ + + cfg = Config.from_file( + os.path.join(self.model.model_dir, 'configuration.json')) + field_name = Tasks.find_field_by_task(cfg.task) + if 'type' not in cfg.preprocessor and 'val' in cfg.preprocessor: + cfg = cfg.preprocessor.val + else: + cfg = cfg.preprocessor + + batch_size = 1 + sequence_length = {} + if shape is not None: + if len(shape) == 1: + batch_size = shape[0] + elif len(shape) == 2: + batch_size, max_length = shape + sequence_length = {'sequence_length': max_length} + + cfg.update({ + 'model_dir': self.model.model_dir, + 'mode': ModeKeys.TRAIN, + **sequence_length + }) + preprocessor: Preprocessor = build_preprocessor(cfg, field_name) + if preprocessor.pair: + first_sequence = preprocessor.tokenizer.unk_token + second_sequence = preprocessor.tokenizer.unk_token + else: + first_sequence = preprocessor.tokenizer.unk_token + second_sequence = None + + batched = [] + for _ in range(batch_size): + batched.append(preprocessor((first_sequence, second_sequence))) + return default_collate(batched) + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + dynamic_axis = {0: 'batch', 1: 'sequence'} + return OrderedDict([ + ('input_ids', dynamic_axis), + ('attention_mask', dynamic_axis), + ('token_type_ids', dynamic_axis), + ]) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict({'logits': {0: 'batch'}}) diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py new file mode 100644 index 00000000..98a23fe5 --- /dev/null +++ b/modelscope/exporters/torch_model_exporter.py @@ -0,0 +1,247 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from contextlib import contextmanager +from itertools import chain +from typing import Any, Dict, Mapping + +import torch +from torch import nn +from torch.onnx import export as onnx_export +from torch.onnx.utils import _decide_input_format + +from modelscope.models import TorchModel +from modelscope.pipelines.base import collate_fn +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.regress_test_utils import compare_arguments_nested +from modelscope.utils.tensor_utils import torch_nested_numpify +from .base import Exporter + +logger = get_logger(__name__) + + +class TorchModelExporter(Exporter): + """The torch base class of exporter. + + This class provides the default implementations for exporting onnx and torch script. + Each specific model may implement its own exporter by overriding the export_onnx/export_torch_script, + and to provide implementations for generate_dummy_inputs/inputs/outputs methods. + """ + + def export_onnx(self, outputs: str, opset=11, **kwargs): + """Export the model as onnx format files. + + In some cases, several files may be generated, + So please return a dict which contains the generated name with the file path. + + @param opset: The version of the ONNX operator set to use. + @param outputs: The output dir. + @param kwargs: In this default implementation, + you can pass the arguments needed by _torch_export_onnx, other unrecognized args + will be carried to generate_dummy_inputs as extra arguments (such as input shape). + @return: A dict containing the model key - model file path pairs. 
+ """ + model = self.model + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + model = model.model + onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE) + self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs) + return {'model': onnx_file} + + def export_torch_script(self, outputs: str, **kwargs): + """Export the model as torch script files. + + In some cases, several files may be generated, + So please return a dict which contains the generated name with the file path. + + @param outputs: The output dir. + @param kwargs: In this default implementation, + you can pass the arguments needed by _torch_export_torch_script, other unrecognized args + will be carried to generate_dummy_inputs as extra arguments (like input shape). + @return: A dict contains the model name with the model file path. + """ + model = self.model + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + model = model.model + ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE) + # generate ts by tracing + self._torch_export_torch_script(model, ts_file, **kwargs) + return {'model': ts_file} + + def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]: + """Generate dummy inputs for model exportation to onnx or other formats by tracing. + @return: Dummy inputs. + """ + return None + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + """Return an ordered dict contains the model's input arguments name with their dynamic axis. + + About the information of dynamic axis please check the dynamic_axes argument of torch.onnx.export function + """ + return None + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + """Return an ordered dict contains the model's output arguments name with their dynamic axis. + + About the information of dynamic axis please check the dynamic_axes argument of torch.onnx.export function + """ + return None + + def _torch_export_onnx(self, + model: nn.Module, + output: str, + opset: int = 11, + device: str = 'cpu', + validation: bool = True, + rtol: float = None, + atol: float = None, + **kwargs): + """Export the model to an onnx format file. + + @param model: A torch.nn.Module instance to export. + @param output: The output file. + @param opset: The version of the ONNX operator set to use. + @param device: The device used to forward. + @param validation: Whether validate the export file. + @param rtol: The rtol used to regress the outputs. + @param atol: The atol used to regress the outputs. 
+ """ + + dummy_inputs = self.generate_dummy_inputs(**kwargs) + inputs = self.inputs + outputs = self.outputs + if dummy_inputs is None or inputs is None or outputs is None: + raise NotImplementedError( + 'Model property dummy_inputs,inputs,outputs must be set.') + + with torch.no_grad(): + model.eval() + device = torch.device(device) + model.to(device) + dummy_inputs = collate_fn(dummy_inputs, device) + + if isinstance(dummy_inputs, Mapping): + dummy_inputs = dict(dummy_inputs) + onnx_outputs = list(self.outputs.keys()) + + with replace_call(): + onnx_export( + model, + (dummy_inputs, ), + f=output, + input_names=list(inputs.keys()), + output_names=onnx_outputs, + dynamic_axes={ + name: axes + for name, axes in chain(inputs.items(), + outputs.items()) + }, + do_constant_folding=True, + opset_version=opset, + ) + + if validation: + try: + import onnx + import onnxruntime as ort + except ImportError: + logger.warn( + 'Cannot validate the exported onnx file, because ' + 'the installation of onnx or onnxruntime cannot be found') + return + onnx_model = onnx.load(output) + onnx.checker.check_model(onnx_model) + ort_session = ort.InferenceSession(output) + with torch.no_grad(): + model.eval() + outputs_origin = model.forward( + *_decide_input_format(model, dummy_inputs)) + if isinstance(outputs_origin, Mapping): + outputs_origin = torch_nested_numpify( + list(outputs_origin.values())) + outputs = ort_session.run( + onnx_outputs, + torch_nested_numpify(dummy_inputs), + ) + + tols = {} + if rtol is not None: + tols['rtol'] = rtol + if atol is not None: + tols['atol'] = atol + if not compare_arguments_nested('Onnx model output match failed', + outputs, outputs_origin, **tols): + raise RuntimeError( + 'export onnx failed because of validation error.') + + def _torch_export_torch_script(self, + model: nn.Module, + output: str, + device: str = 'cpu', + validation: bool = True, + rtol: float = None, + atol: float = None, + **kwargs): + """Export the model to a torch script file. + + @param model: A torch.nn.Module instance to export. + @param output: The output file. + @param device: The device used to forward. + @param validation: Whether validate the export file. + @param rtol: The rtol used to regress the outputs. + @param atol: The atol used to regress the outputs. + """ + + model.eval() + dummy_inputs = self.generate_dummy_inputs(**kwargs) + if dummy_inputs is None: + raise NotImplementedError( + 'Model property dummy_inputs must be set.') + dummy_inputs = collate_fn(dummy_inputs, device) + if isinstance(dummy_inputs, Mapping): + dummy_inputs = tuple(dummy_inputs.values()) + with torch.no_grad(): + model.eval() + with replace_call(): + traced_model = torch.jit.trace( + model, dummy_inputs, strict=False) + torch.jit.save(traced_model, output) + + if validation: + ts_model = torch.jit.load(output) + with torch.no_grad(): + model.eval() + ts_model.eval() + outputs = ts_model.forward(*dummy_inputs) + outputs = torch_nested_numpify(outputs) + outputs_origin = model.forward(*dummy_inputs) + outputs_origin = torch_nested_numpify(outputs_origin) + tols = {} + if rtol is not None: + tols['rtol'] = rtol + if atol is not None: + tols['atol'] = atol + if not compare_arguments_nested( + 'Torch script model output match failed', outputs, + outputs_origin, **tols): + raise RuntimeError( + 'export torch script failed because of validation error.') + + +@contextmanager +def replace_call(): + """This function is used to recover the original call method. + + The Model class of modelscope overrides the call method. 
When exporting to onnx or torchscript, torch will + prepare the parameters as the prototype of forward method, and trace the call method, this causes + problems. Here we recover the call method to the default implementation of torch.nn.Module, and change it + back after the tracing was done. + """ + + TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl + yield + TorchModel.__call__ = TorchModel.call_origin + del TorchModel.call_origin diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py index b526d593..385cd02c 100644 --- a/modelscope/fileio/__init__.py +++ b/modelscope/fileio/__init__.py @@ -1,2 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .file import File, LocalStorage from .io import dump, dumps, load diff --git a/modelscope/fileio/format/__init__.py b/modelscope/fileio/format/__init__.py index 52e64279..68518266 100644 --- a/modelscope/fileio/format/__init__.py +++ b/modelscope/fileio/format/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .base import FormatHandler from .json import JsonHandler from .yaml import YamlHandler diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 721f5637..8dcfa5b0 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import pickle import shutil @@ -389,7 +391,7 @@ class HubApi: cookies = requests.utils.dict_from_cookiejar(cookies) r = requests.get(url=datahub_url, cookies=cookies) resp = r.json() - datahub_raise_on_error(datahub_url, resp) + raise_on_error(resp) return resp['Data'] def on_dataset_download(self, dataset_name: str, namespace: str) -> None: diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 014a1e59..c8664597 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from pathlib import Path MODELSCOPE_URL_SCHEME = 'http://' diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index e9c008b0..c095a6ec 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from http import HTTPStatus from requests.exceptions import HTTPError @@ -60,7 +62,7 @@ def raise_on_error(rsp): Args: rsp (_type_): The server response """ - if rsp['Code'] == HTTPStatus.OK and rsp['Success']: + if rsp['Code'] == HTTPStatus.OK: return True else: raise RequestError(rsp['Message']) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 5f15272c..1cc5645b 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import copy import os import sys diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 08eec3ff..486f8df3 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os import subprocess from typing import List @@ -39,17 +41,28 @@ class GitCommandWrapper(metaclass=Singleton): subprocess.CompletedProcess: the command response """ logger.debug(' '.join(args)) + git_env = os.environ.copy() + git_env['GIT_TERMINAL_PROMPT'] = '0' response = subprocess.run( [self.git_path, *args], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) # compatible for python3.6 + stderr=subprocess.PIPE, + env=git_env, + ) # compatible for python3.6 try: response.check_returncode() return response except subprocess.CalledProcessError as error: - raise GitError( - 'stdout: %s, stderr: %s' % - (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) + if response.returncode == 1: + logger.info('Nothing to commit.') + return response + else: + logger.error( + 'Error running git command; you may need to log in first.' + ) + raise GitError('stdout: %s, stderr: %s' % + (response.stdout.decode('utf8'), + error.stderr.decode('utf8'))) def config_auth_token(self, repo_dir, auth_token): url = self.get_repo_remote_url(repo_dir) diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index 6f560f7a..d92089ed 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Optional @@ -40,6 +42,11 @@ class Repository: self.model_dir = model_dir self.model_base_dir = os.path.dirname(model_dir) self.model_repo_name = os.path.basename(model_dir) + + if not revision: + err_msg = 'a non-default value of revision cannot be empty.' + raise InvalidParameter(err_msg) + if auth_token: self.auth_token = auth_token else: @@ -145,10 +152,21 @@ class DatasetRepository: The git command line path, if None, we use 'git' """ self.dataset_id = dataset_id - self.repo_work_dir = repo_work_dir - self.repo_base_dir = os.path.dirname(repo_work_dir) - self.repo_name = os.path.basename(repo_work_dir) + if not repo_work_dir or not isinstance(repo_work_dir, str): + err_msg = 'dataset_work_dir must be provided!' + raise InvalidParameter(err_msg) + self.repo_work_dir = repo_work_dir.rstrip('/') + if not self.repo_work_dir: + err_msg = 'dataset_work_dir can not be root dir!' + raise InvalidParameter(err_msg) + self.repo_base_dir = os.path.dirname(self.repo_work_dir) + self.repo_name = os.path.basename(self.repo_work_dir) + + if not revision: + err_msg = 'a non-default value of revision cannot be empty.' + raise InvalidParameter(err_msg) self.revision = revision + if auth_token: self.auth_token = auth_token else: @@ -199,7 +217,9 @@ class DatasetRepository: self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) - remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) + remote_url = self._get_remote_url() + remote_url = self.git_wrapper.remove_token_from_url(remote_url) + self.git_wrapper.pull(self.repo_work_dir) self.git_wrapper.add(self.repo_work_dir, all_files=True) self.git_wrapper.commit(self.repo_work_dir, commit_message) diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index c63d8956..cde6ad34 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates.
+ import os import tempfile from pathlib import Path diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index fc30fa27..1acd2e84 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import hashlib import os import pickle diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 7e219d16..d84b78ea 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import hashlib import os from typing import Optional diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index c83200af..ca466bfd 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -35,6 +35,10 @@ class Models(object): fer = 'fer' retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' + mogface = 'mogface' + mtcnn = 'mtcnn' + ulfd = 'ulfd' + video_inpainting = 'video-inpainting' # EasyCV models yolox = 'YOLOX' @@ -51,11 +55,16 @@ class Models(object): space_intent = 'space-intent' space_modeling = 'space-modeling' star = 'star' + star3 = 'star3' tcrf = 'transformer-crf' + transformer_softmax = 'transformer-softmax' lcrf = 'lstm-crf' + gcnncrf = 'gcnn-crf' bart = 'bart' gpt3 = 'gpt3' + plug = 'plug' bert_for_ds = 'bert-for-document-segmentation' + ponet = 'ponet' # audio models sambert_hifigan = 'sambert-hifigan' @@ -70,6 +79,7 @@ class Models(object): gemm = 'gemm-generative-multi-modal' mplug = 'mplug' diffusion = 'diffusion-text-to-image-synthesis' + multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis' team = 'team-multi-modal-similarity' video_clip = 'video-clip-multi-modal-embedding' @@ -77,6 +87,7 @@ class Models(object): class TaskModels(object): # nlp task text_classification = 'text-classification' + token_classification = 'token-classification' information_extraction = 'information-extraction' @@ -87,6 +98,8 @@ class Heads(object): bert_mlm = 'bert-mlm' # roberta mlm roberta_mlm = 'roberta-mlm' + # token cls + token_classification = 'token-classification' information_extraction = 'information-extraction' @@ -121,8 +134,11 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + ulfd_face_detection = 'manual-face-detection-ulfd' facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' retina_face_detection = 'resnet50-face-detection-retinaface' + mog_face_detection = 'resnet101-face-detection-cvpr22papermogface' + mtcnn_face_detection = 'manual-face-detection-mtcnn' live_category = 'live-category' general_image_classification = 'vit-base_image-classification_ImageNet-labels' daily_image_classification = 'vit-base_image-classification_Dailylife-labels' @@ -155,16 +171,19 @@ class Pipelines(object): text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' + video_inpainting = 'video-inpainting' # nlp tasks sentence_similarity = 'sentence-similarity' word_segmentation = 'word-segmentation' + part_of_speech = 'part-of-speech' named_entity_recognition = 'named-entity-recognition' text_generation = 'text-generation' sentiment_analysis = 'sentiment-analysis' sentiment_classification = 'sentiment-classification' text_classification = 'text-classification' fill_mask = 'fill-mask' + fill_mask_ponet = 
'fill-mask-ponet' csanmt_translation = 'csanmt-translation' nli = 'nli' dialog_intent_prediction = 'dialog-intent-prediction' @@ -172,8 +191,12 @@ class Pipelines(object): dialog_state_tracking = 'dialog-state-tracking' zero_shot_classification = 'zero-shot-classification' text_error_correction = 'text-error-correction' + plug_generation = 'plug-generation' faq_question_answering = 'faq-question-answering' conversational_text_to_sql = 'conversational-text-to-sql' + table_question_answering_pipeline = 'table-question-answering-pipeline' + sentence_embedding = 'sentence-embedding' + passage_ranking = 'passage-ranking' relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' @@ -223,8 +246,11 @@ class Trainers(object): # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' + dialog_modeling_trainer = 'dialog-modeling-trainer' + dialog_intent_trainer = 'dialog-intent-trainer' nlp_base_trainer = 'nlp-base-trainer' nlp_veco_trainer = 'nlp-veco-trainer' + nlp_passage_ranking_trainer = 'nlp-passage-ranking-trainer' # audio trainers speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' @@ -252,6 +278,7 @@ class Preprocessors(object): # nlp preprocessor sen_sim_tokenizer = 'sen-sim-tokenizer' + cross_encoder_tokenizer = 'cross-encoder-tokenizer' bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' token_cls_tokenizer = 'token-cls-tokenizer' @@ -264,10 +291,15 @@ class Preprocessors(object): sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer' zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' text_error_correction = 'text-error-correction' + sentence_embedding = 'sentence-embedding' + passage_ranking = 'passage-ranking' + sequence_labeling_tokenizer = 'sequence-labeling-tokenizer' word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' fill_mask = 'fill-mask' + fill_mask_ponet = 'fill-mask-ponet' faq_question_answering_preprocessor = 'faq-question-answering-preprocessor' conversational_text_to_sql = 'conversational-text-to-sql' + table_question_answering_preprocessor = 'table-question-answering-preprocessor' re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py index 16c5261f..f26db46d 100644 --- a/modelscope/metrics/audio_noise_metric.py +++ b/modelscope/metrics/audio_noise_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Dict from modelscope.metainfo import Metrics diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index 83cb39ca..d795d8a2 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -14,9 +14,9 @@ from .builder import METRICS, MetricKeys @METRICS.register_module( group_key=default_group, module_name=Metrics.seq_cls_metric) class SequenceClassificationMetric(Metric): - """The metric computation class for sequence classification classes. + """The metric computation class for sequence classification tasks. - This metric class calculates accuracy for the whole input batches. + This metric class calculates accuracy of the whole input batches. 
""" def __init__(self, *args, **kwargs): diff --git a/modelscope/models/audio/aec/layers/activations.py b/modelscope/models/audio/aec/layers/activations.py index b0215bcc..f78ad4b5 100644 --- a/modelscope/models/audio/aec/layers/activations.py +++ b/modelscope/models/audio/aec/layers/activations.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch.nn as nn from .layer_base import LayerBase diff --git a/modelscope/models/audio/aec/layers/affine_transform.py b/modelscope/models/audio/aec/layers/affine_transform.py index 33479505..2de8a03f 100644 --- a/modelscope/models/audio/aec/layers/affine_transform.py +++ b/modelscope/models/audio/aec/layers/affine_transform.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import numpy as np import torch as th import torch.nn as nn diff --git a/modelscope/models/audio/aec/layers/deep_fsmn.py b/modelscope/models/audio/aec/layers/deep_fsmn.py index 72ba07dc..1582b908 100644 --- a/modelscope/models/audio/aec/layers/deep_fsmn.py +++ b/modelscope/models/audio/aec/layers/deep_fsmn.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import numpy as np import torch as th import torch.nn as nn diff --git a/modelscope/models/audio/aec/layers/layer_base.py b/modelscope/models/audio/aec/layers/layer_base.py index e56c4bc0..7c39e5be 100644 --- a/modelscope/models/audio/aec/layers/layer_base.py +++ b/modelscope/models/audio/aec/layers/layer_base.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import abc import re diff --git a/modelscope/models/audio/aec/layers/uni_deep_fsmn.py b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py index c22460c4..a276db05 100644 --- a/modelscope/models/audio/aec/layers/uni_deep_fsmn.py +++ b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import numpy as np import torch as th import torch.nn as nn diff --git a/modelscope/models/audio/aec/network/loss.py b/modelscope/models/audio/aec/network/loss.py index 743661b3..1f20072a 100644 --- a/modelscope/models/audio/aec/network/loss.py +++ b/modelscope/models/audio/aec/network/loss.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch import torch.nn.functional as F diff --git a/modelscope/models/audio/aec/network/modulation_loss.py b/modelscope/models/audio/aec/network/modulation_loss.py index a45ddead..3017b5c6 100644 --- a/modelscope/models/audio/aec/network/modulation_loss.py +++ b/modelscope/models/audio/aec/network/modulation_loss.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import math import torch diff --git a/modelscope/models/audio/aec/network/se_net.py b/modelscope/models/audio/aec/network/se_net.py index 837cad3c..40639605 100644 --- a/modelscope/models/audio/aec/network/se_net.py +++ b/modelscope/models/audio/aec/network/se_net.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py index 9768eff7..beaa3187 100644 --- a/modelscope/models/audio/ans/complex_nn.py +++ b/modelscope/models/audio/ans/complex_nn.py @@ -1,9 +1,10 @@ -""" -The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d - here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) -and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch +# Copyright (c) Alibaba, Inc. and its affiliates. +# +# The implementation of class ComplexConv2d, ComplexConvTranspose2d and +# ComplexBatchNorm2d here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr +# / Seoul National Univ., ESTsoft ) and publicly available at +# https://github.com/sweetcocoa/DeepComplexUNetPyTorch -""" import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py index 3a9c5549..7b4df1e9 100644 --- a/modelscope/models/audio/ans/unet.py +++ b/modelscope/models/audio/ans/unet.py @@ -1,8 +1,10 @@ -""" -The implementation here is modified based on - Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) -and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch -""" +# Copyright (c) Alibaba, Inc. and its affiliates. +# +# The implementation here is modified based on +# Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) +# and publicly available at +# https://github.com/sweetcocoa/DeepComplexUNetPyTorch + import torch import torch.nn as nn diff --git a/modelscope/models/audio/kws/farfield/fsmn.py b/modelscope/models/audio/kws/farfield/fsmn.py index e88d3976..e06d7911 100644 --- a/modelscope/models/audio/kws/farfield/fsmn.py +++ b/modelscope/models/audio/kws/farfield/fsmn.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py index 1884e533..8af16cc9 100644 --- a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py +++ b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index 428ec367..fea82194 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Dict diff --git a/modelscope/models/audio/kws/farfield/model_def.py b/modelscope/models/audio/kws/farfield/model_def.py index 3f5ba7d7..be9cca2c 100644 --- a/modelscope/models/audio/kws/farfield/model_def.py +++ b/modelscope/models/audio/kws/farfield/model_def.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import math import struct from enum import Enum diff --git a/modelscope/models/base/__init__.py b/modelscope/models/base/__init__.py index ab7901af..8c47ecaf 100644 --- a/modelscope/models/base/__init__.py +++ b/modelscope/models/base/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from .base_head import * # noqa F403 from .base_model import * # noqa F403 from .base_torch_head import * # noqa F403 diff --git a/modelscope/models/base/base_head.py b/modelscope/models/base/base_head.py index 07a68253..11bda32f 100644 --- a/modelscope/models/base/base_head.py +++ b/modelscope/models/base/base_head.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from abc import ABC, abstractmethod -from typing import Dict, Union +from typing import Any, Dict, Union from modelscope.models.base.base_model import Model from modelscope.utils.config import ConfigDict @@ -22,25 +22,20 @@ class Head(ABC): self.config = ConfigDict(kwargs) @abstractmethod - def forward(self, input: Input) -> Dict[str, Tensor]: + def forward(self, *args, **kwargs) -> Dict[str, Any]: """ This method will use the output from backbone model to do any - downstream tasks - Args: - input: The tensor output or a model from backbone model - (text generation need a model as input) - Returns: The output from downstream taks + downstream tasks. Receives the output from the backbone model. + + Returns (Dict[str, Any]): The output from the downstream task. """ pass @abstractmethod - def compute_loss(self, outputs: Dict[str, Tensor], - labels) -> Dict[str, Tensor]: + def compute_loss(self, *args, **kwargs) -> Dict[str, Any]: """ - compute loss for head during the finetuning + Compute the loss for the head during finetuning. - Args: - outputs (Dict[str, Tensor]): the output from the model forward - Returns: the loss(Dict[str, Tensor]): + Returns (Dict[str, Any]): The loss dict. """ pass diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 872c42e8..cdc71fcf 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -2,7 +2,7 @@ import os import os.path as osp from abc import ABC, abstractmethod -from typing import Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model @@ -10,8 +10,6 @@ from modelscope.utils.checkpoint import save_pretrained from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile from modelscope.utils.device import device_placement, verify_device -from modelscope.utils.file_utils import func_receive_dict_inputs -from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger logger = get_logger() @@ -27,35 +25,31 @@ class Model(ABC): verify_device(device_name) self._device_name = device_name - def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - return self.postprocess(self.forward(input)) + def __call__(self, *args, **kwargs) -> Dict[str, Any]: + return self.postprocess(self.forward(*args, **kwargs)) @abstractmethod - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + def forward(self, *args, **kwargs) -> Dict[str, Any]: """ Run the forward pass for a model. - Args: - input (Dict[str, Tensor]): the dict of the model inputs for the forward method - Returns: - Dict[str, Tensor]: output from the model forward pass + Dict[str, Any]: output from the model forward pass """ pass - def postprocess(self, input: Dict[str, Tensor], - **kwargs) -> Dict[str, Tensor]: + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Model specific postprocess and convert model output to standard model outputs.
Args: - input: input data + inputs: input data Return: dict of results: a dict containing outputs of model, each output should have the standard output name. """ - return input + return inputs @classmethod def _instantiate(cls, **kwargs): @@ -97,7 +91,6 @@ class Model(ABC): osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task model_cfg = cfg.model - framework = cfg.framework if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type @@ -107,9 +100,8 @@ class Model(ABC): model_cfg[k] = v if device is not None: model_cfg.device = device - with device_placement(framework, device): - model = build_model( - model_cfg, task_name=task_name, default_args=kwargs) + model = build_model( + model_cfg, task_name=task_name, default_args=kwargs) else: model = build_model( model_cfg, task_name=task_name, default_args=kwargs) diff --git a/modelscope/models/base/base_torch_head.py b/modelscope/models/base/base_torch_head.py index c5a78519..faee4296 100644 --- a/modelscope/models/base/base_torch_head.py +++ b/modelscope/models/base/base_torch_head.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Dict +from typing import Any, Dict import torch @@ -18,10 +18,8 @@ class TorchHead(Head, torch.nn.Module): super().__init__(**kwargs) torch.nn.Module.__init__(self) - def forward(self, inputs: Dict[str, - torch.Tensor]) -> Dict[str, torch.Tensor]: + def forward(self, *args, **kwargs) -> Dict[str, Any]: raise NotImplementedError - def compute_loss(self, outputs: Dict[str, torch.Tensor], - labels) -> Dict[str, torch.Tensor]: + def compute_loss(self, *args, **kwargs) -> Dict[str, Any]: raise NotImplementedError diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index cfc88721..3c99a1f2 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Optional, Union +from typing import Any, Dict import torch from torch import nn @@ -21,15 +21,14 @@ class TorchModel(Model, torch.nn.Module): super().__init__(model_dir, *args, **kwargs) torch.nn.Module.__init__(self) - def __call__(self, input: Dict[str, - torch.Tensor]) -> Dict[str, torch.Tensor]: + def __call__(self, *args, **kwargs) -> Dict[str, Any]: + # Adapting a model with only one dict arg, and the arg name must be input or inputs if func_receive_dict_inputs(self.forward): - return self.postprocess(self.forward(input)) + return self.postprocess(self.forward(args[0], **kwargs)) else: - return self.postprocess(self.forward(**input)) + return self.postprocess(self.forward(*args, **kwargs)) - def forward(self, inputs: Dict[str, - torch.Tensor]) -> Dict[str, torch.Tensor]: + def forward(self, *args, **kwargs) -> Dict[str, Any]: raise NotImplementedError def post_init(self): diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py index 3c171473..1c8be354 100644 --- a/modelscope/models/cv/action_detection/action_detection_onnx.py +++ b/modelscope/models/cv/action_detection/action_detection_onnx.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os import os.path as osp import shutil diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py index a3c47164..a2a845d2 100644 --- a/modelscope/models/cv/face_detection/__init__.py +++ b/modelscope/models/cv/face_detection/__init__.py @@ -4,11 +4,16 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: + from .mogface import MogFaceDetector + from .mtcnn import MtcnnFaceDetector from .retinaface import RetinaFaceDetection - + from .ulfd_slim import UlfdFaceDetector else: _import_structure = { + 'ulfd_slim': ['UlfdFaceDetector'], 'retinaface': ['RetinaFaceDetection'], + 'mtcnn': ['MtcnnFaceDetector'], + 'mogface': ['MogFaceDetector'] } import sys diff --git a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/__init__.py index 921bdc08..5a895582 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/__init__.py @@ -1,5 +1,4 @@ """ -mmdet_patch is based on -https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet, -all duplicate functions from official mmdetection are removed. +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet """ diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py index 8375649c..cf1b7313 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox +""" from .transforms import bbox2result, distance2kps, kps2distance __all__ = ['bbox2result', 'distance2kps', 'kps2distance'] diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py index 26278837..d65480eb 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py """ import numpy as np import torch diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py index 8cd31348..61602fd3 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py +""" from .bbox_nms import multiclass_nms __all__ = ['multiclass_nms'] 
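A usage sketch for the new modelscope.exporters package introduced earlier in this diff (Exporter.from_model together with TorchModelExporter.export_onnx). This is illustrative only and not part of the change set: the model id and output directory are placeholder assumptions, and it presumes a StructBERT sequence-classification model whose model_dir contains a configuration.json. Extra keyword arguments such as shape are forwarded to generate_dummy_inputs(), and export_onnx() returns a dict mapping a name to the generated file path.

from modelscope.exporters import Exporter
from modelscope.models import Model

# Placeholder model id; any StructBERT sequence-classification model directory
# with a configuration.json is expected to behave the same way.
model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')

# Builds the exporter registered for this task/model type in the EXPORTERS registry,
# based on the task and model type read from the model's configuration.json.
exporter = Exporter.from_model(model)

# shape=(1, 128) -> batch_size=1, sequence_length=128 for the traced dummy inputs;
# the returned dict maps a name to the generated onnx file inside the output dir.
result = exporter.export_onnx(outputs='/tmp/onnx_export', opset=11, shape=(1, 128))
print(result)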
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py index efe8813f..7a4f5b3a 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py """ import torch diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py index 07a45208..cea179b0 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets +""" from .retinaface import RetinaFaceDataset __all__ = ['RetinaFaceDataset'] diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py index 979212a3..85288910 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines +""" from .transforms import RandomSquareCrop __all__ = ['RandomSquareCrop'] diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py index 3048cefa..241f2c0e 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py """ import numpy as np from mmdet.datasets.builder import PIPELINES diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py index bf20764b..bbacd9be 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py 
""" import numpy as np from mmdet.datasets.builder import DATASETS diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py index 38c8ff5b..bd5d5f5f 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py @@ -1,2 +1,6 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models +""" from .dense_heads import * # noqa: F401,F403 from .detectors import * # noqa: F401,F403 diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py index 2d930bf4..5c3b190e 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones +""" from .resnet import ResNetV1e __all__ = ['ResNetV1e'] diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py index 54bcb127..a5862a58 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py """ import torch.nn as nn import torch.utils.checkpoint as cp diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py index e67031bc..9ba63b68 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads +""" from .scrfd_head import SCRFDHead __all__ = ['SCRFDHead'] diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py index 1667f29f..acc45670 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py """ import numpy as np import 
torch diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py index 1c16028f..7935606a 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors +""" from .scrfd import SCRFD __all__ = ['SCRFD'] diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py index 98b6702c..a5f5cac2 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py +++ b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py @@ -1,5 +1,6 @@ """ -based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py """ import torch from mmdet.models.builder import DETECTORS diff --git a/modelscope/models/cv/face_detection/mogface/__init__.py b/modelscope/models/cv/face_detection/mogface/__init__.py new file mode 100644 index 00000000..8190b649 --- /dev/null +++ b/modelscope/models/cv/face_detection/mogface/__init__.py @@ -0,0 +1 @@ +from .models.detectors import MogFaceDetector diff --git a/modelscope/models/cv/face_detection/mogface/models/__init__.py b/modelscope/models/cv/face_detection/mogface/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/mogface/models/detectors.py b/modelscope/models/cv/face_detection/mogface/models/detectors.py new file mode 100644 index 00000000..5ae67104 --- /dev/null +++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py @@ -0,0 +1,96 @@ +import os + +import cv2 +import numpy as np +import torch +import torch.backends.cudnn as cudnn + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .mogface import MogFace +from .utils import MogPriorBox, mogdecode, py_cpu_nms + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.mogface) +class MogFaceDetector(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.device = device + self.net = MogFace() + self.load_model() + self.net = self.net.to(device) + + self.mean = np.array([[104, 117, 123]]) + + def load_model(self, load_to_cpu=False): + pretrained_dict = torch.load( + self.model_path, map_location=torch.device('cpu')) + self.net.load_state_dict(pretrained_dict, strict=False) + self.net.eval() + + def forward(self, input): + img_raw = input['img'] + img = np.array(img_raw.cpu().detach()) + img = img[:, :, ::-1] + + im_height, im_width = img.shape[:2] + ss = 1.0 + # tricky + if max(im_height, im_width) > 1500: + ss = 1000.0 / max(im_height, im_width) + img = cv2.resize(img, (0, 0), fx=ss, fy=ss) + im_height, im_width = img.shape[:2] + + scale = 
torch.Tensor( + [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) + img -= np.array([[103.53, 116.28, 123.675]]) + img /= np.array([[57.375, 57.120003, 58.395]]) + img /= 255 + img = img[:, :, ::-1].copy() + img = img.transpose(2, 0, 1) + img = torch.from_numpy(img).unsqueeze(0) + img = img.to(self.device) + scale = scale.to(self.device) + + conf, loc = self.net(img) # forward pass + + confidence_threshold = 0.82 + nms_threshold = 0.4 + top_k = 5000 + keep_top_k = 750 + + priorbox = MogPriorBox(scale_list=[0.68]) + priors = priorbox(im_height, im_width) + priors = torch.tensor(priors).to(self.device) + prior_data = priors.data + + boxes = mogdecode(loc.data.squeeze(0), prior_data) + boxes = boxes.cpu().numpy() + scores = conf.squeeze(0).data.cpu().numpy()[:, 0] + + # ignore low scores + inds = np.where(scores > confidence_threshold)[0] + boxes = boxes[inds] + scores = scores[inds] + + # keep top-K before NMS + order = scores.argsort()[::-1][:top_k] + boxes = boxes[order] + scores = scores[order] + + # do NMS + dets = np.hstack((boxes, scores[:, np.newaxis])).astype( + np.float32, copy=False) + keep = py_cpu_nms(dets, nms_threshold) + dets = dets[keep, :] + + # keep top-K faster NMS + dets = dets[:keep_top_k, :] + + return dets / ss diff --git a/modelscope/models/cv/face_detection/mogface/models/mogface.py b/modelscope/models/cv/face_detection/mogface/models/mogface.py new file mode 100644 index 00000000..294c2c6b --- /dev/null +++ b/modelscope/models/cv/face_detection/mogface/models/mogface.py @@ -0,0 +1,135 @@ +# -------------------------------------------------------- +# The implementation is also open-sourced by the authors as Yang Liu, and is available publicly on +# https://github.com/damo-cv/MogFace +# -------------------------------------------------------- +import torch.nn as nn +import torch.nn.functional as F + +from .mogprednet import MogPredNet +from .resnet import ResNet + + +class MogFace(nn.Module): + + def __init__(self): + super(MogFace, self).__init__() + self.backbone = ResNet(depth=101) + self.fpn = LFPN() + self.pred_net = MogPredNet() + + def forward(self, x): + feature_list = self.backbone(x) + fpn_list = self.fpn(feature_list) + pyramid_feature_list = fpn_list[0] + conf, loc = self.pred_net(pyramid_feature_list) + return conf, loc + + +class FeatureFusion(nn.Module): + + def __init__(self, lat_ch=256, **channels): + super(FeatureFusion, self).__init__() + self.main_conv = nn.Conv2d(channels['main'], lat_ch, kernel_size=1) + + def forward(self, up, main): + main = self.main_conv(main) + _, _, H, W = main.size() + res = F.upsample(up, scale_factor=2, mode='bilinear') + if res.size(2) != main.size(2) or res.size(3) != main.size(3): + res = res[:, :, 0:H, 0:W] + res = res + main + return res + + +class LFPN(nn.Module): + + def __init__(self, + c2_out_ch=256, + c3_out_ch=512, + c4_out_ch=1024, + c5_out_ch=2048, + c6_mid_ch=512, + c6_out_ch=512, + c7_mid_ch=128, + c7_out_ch=256, + out_dsfd_ft=True): + super(LFPN, self).__init__() + self.out_dsfd_ft = out_dsfd_ft + if self.out_dsfd_ft: + dsfd_module = [] + dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1)) + dsfd_module.append(nn.Conv2d(512, 256, kernel_size=3, padding=1)) + dsfd_module.append(nn.Conv2d(1024, 256, kernel_size=3, padding=1)) + dsfd_module.append(nn.Conv2d(2048, 256, kernel_size=3, padding=1)) + dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1)) + dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1)) + self.dsfd_modules = nn.ModuleList(dsfd_module) + + 
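        # Descriptive note on the two blocks that follow: c6 and c7 extend the
        # backbone pyramid from C5 via a 1x1 channel reduction followed by a
        # stride-2 3x3 conv; the c5/c6/c7 "_lat" convs bring those levels down
        # to 256 channels, and FeatureFusion adds each upsampled coarser level
        # onto the next finer backbone feature during the top-down pass in
        # forward().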
c6_input_ch = c5_out_ch + self.c6 = nn.Sequential(*[ + nn.Conv2d( + c6_input_ch, + c6_mid_ch, + kernel_size=1, + ), + nn.BatchNorm2d(c6_mid_ch), + nn.ReLU(inplace=True), + nn.Conv2d( + c6_mid_ch, c6_out_ch, kernel_size=3, padding=1, stride=2), + nn.BatchNorm2d(c6_out_ch), + nn.ReLU(inplace=True) + ]) + self.c7 = nn.Sequential(*[ + nn.Conv2d( + c6_out_ch, + c7_mid_ch, + kernel_size=1, + ), + nn.BatchNorm2d(c7_mid_ch), + nn.ReLU(inplace=True), + nn.Conv2d( + c7_mid_ch, c7_out_ch, kernel_size=3, padding=1, stride=2), + nn.BatchNorm2d(c7_out_ch), + nn.ReLU(inplace=True) + ]) + + self.p2_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1) + self.p3_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1) + self.p4_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1) + + self.c5_lat = nn.Conv2d(c6_input_ch, 256, kernel_size=3, padding=1) + self.c6_lat = nn.Conv2d(c6_out_ch, 256, kernel_size=3, padding=1) + self.c7_lat = nn.Conv2d(c7_out_ch, 256, kernel_size=3, padding=1) + + self.ff_c5_c4 = FeatureFusion(main=c4_out_ch) + self.ff_c4_c3 = FeatureFusion(main=c3_out_ch) + self.ff_c3_c2 = FeatureFusion(main=c2_out_ch) + + def forward(self, feature_list): + c2, c3, c4, c5 = feature_list + c6 = self.c6(c5) + c7 = self.c7(c6) + + c5 = self.c5_lat(c5) + c6 = self.c6_lat(c6) + c7 = self.c7_lat(c7) + + if self.out_dsfd_ft: + dsfd_fts = [] + dsfd_fts.append(self.dsfd_modules[0](c2)) + dsfd_fts.append(self.dsfd_modules[1](c3)) + dsfd_fts.append(self.dsfd_modules[2](c4)) + dsfd_fts.append(self.dsfd_modules[3](feature_list[-1])) + dsfd_fts.append(self.dsfd_modules[4](c6)) + dsfd_fts.append(self.dsfd_modules[5](c7)) + + p4 = self.ff_c5_c4(c5, c4) + p3 = self.ff_c4_c3(p4, c3) + p2 = self.ff_c3_c2(p3, c2) + + p2 = self.p2_lat(p2) + p3 = self.p3_lat(p3) + p4 = self.p4_lat(p4) + + if self.out_dsfd_ft: + return ([p2, p3, p4, c5, c6, c7], dsfd_fts) diff --git a/modelscope/models/cv/face_detection/mogface/models/mogprednet.py b/modelscope/models/cv/face_detection/mogface/models/mogprednet.py new file mode 100644 index 00000000..31384976 --- /dev/null +++ b/modelscope/models/cv/face_detection/mogface/models/mogprednet.py @@ -0,0 +1,164 @@ +# -------------------------------------------------------- +# The implementation is also open-sourced by the authors as Yang Liu, and is available publicly on +# https://github.com/damo-cv/MogFace +# -------------------------------------------------------- +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class conv_bn(nn.Module): + """docstring for conv""" + + def __init__(self, in_plane, out_plane, kernel_size, stride, padding): + super(conv_bn, self).__init__() + self.conv1 = nn.Conv2d( + in_plane, + out_plane, + kernel_size=kernel_size, + stride=stride, + padding=padding) + self.bn1 = nn.BatchNorm2d(out_plane) + + def forward(self, x): + x = self.conv1(x) + return self.bn1(x) + + +class SSHContext(nn.Module): + + def __init__(self, channels, Xchannels=256): + super(SSHContext, self).__init__() + + self.conv1 = nn.Conv2d( + channels, Xchannels, kernel_size=3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + channels, + Xchannels // 2, + kernel_size=3, + dilation=2, + stride=1, + padding=2) + self.conv2_1 = nn.Conv2d( + Xchannels // 2, Xchannels // 2, kernel_size=3, stride=1, padding=1) + self.conv2_2 = nn.Conv2d( + Xchannels // 2, + Xchannels // 2, + kernel_size=3, + dilation=2, + stride=1, + padding=2) + self.conv2_2_1 = nn.Conv2d( + Xchannels // 2, Xchannels // 2, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x1 = 
F.relu(self.conv1(x), inplace=True) + x2 = F.relu(self.conv2(x), inplace=True) + x2_1 = F.relu(self.conv2_1(x2), inplace=True) + x2_2 = F.relu(self.conv2_2(x2), inplace=True) + x2_2 = F.relu(self.conv2_2_1(x2_2), inplace=True) + + return torch.cat([x1, x2_1, x2_2], 1) + + +class DeepHead(nn.Module): + + def __init__(self, + in_channel=256, + out_channel=256, + use_gn=False, + num_conv=4): + super(DeepHead, self).__init__() + self.use_gn = use_gn + self.num_conv = num_conv + self.conv1 = nn.Conv2d(in_channel, out_channel, 3, 1, 1) + self.conv2 = nn.Conv2d(out_channel, out_channel, 3, 1, 1) + self.conv3 = nn.Conv2d(out_channel, out_channel, 3, 1, 1) + self.conv4 = nn.Conv2d(out_channel, out_channel, 3, 1, 1) + if self.use_gn: + self.gn1 = nn.GroupNorm(16, out_channel) + self.gn2 = nn.GroupNorm(16, out_channel) + self.gn3 = nn.GroupNorm(16, out_channel) + self.gn4 = nn.GroupNorm(16, out_channel) + + def forward(self, x): + if self.use_gn: + x1 = F.relu(self.gn1(self.conv1(x)), inplace=True) + x2 = F.relu(self.gn2(self.conv1(x1)), inplace=True) + x3 = F.relu(self.gn3(self.conv1(x2)), inplace=True) + x4 = F.relu(self.gn4(self.conv1(x3)), inplace=True) + else: + x1 = F.relu(self.conv1(x), inplace=True) + x2 = F.relu(self.conv1(x1), inplace=True) + if self.num_conv == 2: + return x2 + x3 = F.relu(self.conv1(x2), inplace=True) + x4 = F.relu(self.conv1(x3), inplace=True) + + return x4 + + +class MogPredNet(nn.Module): + + def __init__(self, + num_anchor_per_pixel=1, + num_classes=1, + input_ch_list=[256, 256, 256, 256, 256, 256], + use_deep_head=True, + deep_head_with_gn=True, + use_ssh=True, + deep_head_ch=512): + super(MogPredNet, self).__init__() + self.num_classes = num_classes + self.use_deep_head = use_deep_head + self.deep_head_with_gn = deep_head_with_gn + + self.use_ssh = use_ssh + + self.deep_head_ch = deep_head_ch + + if self.use_ssh: + self.conv_SSH = SSHContext(input_ch_list[0], + self.deep_head_ch // 2) + + if self.use_deep_head: + if self.deep_head_with_gn: + self.deep_loc_head = DeepHead( + self.deep_head_ch, self.deep_head_ch, use_gn=True) + self.deep_cls_head = DeepHead( + self.deep_head_ch, self.deep_head_ch, use_gn=True) + + self.pred_cls = nn.Conv2d(self.deep_head_ch, + 1 * num_anchor_per_pixel, 3, 1, 1) + self.pred_loc = nn.Conv2d(self.deep_head_ch, + 4 * num_anchor_per_pixel, 3, 1, 1) + + self.sigmoid = nn.Sigmoid() + + def forward(self, pyramid_feature_list, dsfd_ft_list=None): + loc = [] + conf = [] + + if self.use_deep_head: + for x in pyramid_feature_list: + if self.use_ssh: + x = self.conv_SSH(x) + x_cls = self.deep_cls_head(x) + x_loc = self.deep_loc_head(x) + + conf.append( + self.pred_cls(x_cls).permute(0, 2, 3, 1).contiguous()) + loc.append( + self.pred_loc(x_loc).permute(0, 2, 3, 1).contiguous()) + + loc = torch.cat([o.view(o.size(0), -1, 4) for o in loc], 1) + conf = torch.cat( + [o.view(o.size(0), -1, self.num_classes) for o in conf], 1) + output = ( + self.sigmoid(conf.view(conf.size(0), -1, self.num_classes)), + loc.view(loc.size(0), -1, 4), + ) + + return output diff --git a/modelscope/models/cv/face_detection/mogface/models/resnet.py b/modelscope/models/cv/face_detection/mogface/models/resnet.py new file mode 100644 index 00000000..045f6fa3 --- /dev/null +++ b/modelscope/models/cv/face_detection/mogface/models/resnet.py @@ -0,0 +1,193 @@ +# The implementation is modified from original resent implementaiton, which is +# also open-sourced by the authors as Yang Liu, +# and is available publicly on https://github.com/damo-cv/MogFace + +import torch.nn as nn + + 
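# Only the feature-extraction part of ResNet is kept in this file: forward()
# returns the outputs of the four residual stages (C2-C5) rather than logits.
# As an illustrative example (not from the upstream code), with the depth=101
# configuration used by MogFace, a 3x640x640 input yields feature maps of
# stride 4/8/16/32 with 256/512/1024/2048 channels, matching LFPN's defaults.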
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, + depth=50, + groups=1, + width_per_group=64, + replace_stride_with_dilation=None, + norm_layer=None, + inplanes=64, + shrink_ch_ratio=1): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + if depth == 50: + block = Bottleneck + layers = [3, 4, 6, 3] + elif depth == 101: + block = Bottleneck + layers = [3, 4, 23, 3] + elif depth == 152: + block = Bottleneck + layers = [3, 4, 36, 3] + elif depth == 18: + block = BasicBlock + layers = [2, 2, 2, 2] + else: + raise ValueError('only support depth in [18, 50, 101, 152]') + + shrink_input_ch = int(inplanes * shrink_ch_ratio) + self.inplanes = int(inplanes * shrink_ch_ratio) + if shrink_ch_ratio == 0.125: + layers = [2, 3, 3, 3] + + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError('replace_stride_with_dilation should be None ' + 'or a 3-element tuple, got {}'.format( + replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d( + 3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, shrink_input_ch, layers[0]) + self.layer2 = self._make_layer( + block, + shrink_input_ch * 2, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer( + block, + shrink_input_ch * 4, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer( + block, + shrink_input_ch * 8, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + + def 
_make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + four_conv_layer = [] + x = self.layer1(x) + four_conv_layer.append(x) + x = self.layer2(x) + four_conv_layer.append(x) + x = self.layer3(x) + four_conv_layer.append(x) + x = self.layer4(x) + four_conv_layer.append(x) + + return four_conv_layer diff --git a/modelscope/models/cv/face_detection/mogface/models/utils.py b/modelscope/models/cv/face_detection/mogface/models/utils.py new file mode 100755 index 00000000..377ceb3d --- /dev/null +++ b/modelscope/models/cv/face_detection/mogface/models/utils.py @@ -0,0 +1,212 @@ +# Modified from https://github.com/biubug6/Pytorch_Retinaface + +import math +from itertools import product as product +from math import ceil + +import numpy as np +import torch + + +def transform_anchor(anchors): + """ + from [x0, x1, y0, y1] to [c_x, cy, w, h] + x1 = x0 + w - 1 + c_x = (x0 + x1) / 2 = (2x0 + w - 1) / 2 = x0 + (w - 1) / 2 + """ + return np.concatenate(((anchors[:, :2] + anchors[:, 2:]) / 2, + anchors[:, 2:] - anchors[:, :2] + 1), + axis=1) + + +def normalize_anchor(anchors): + """ + from [c_x, cy, w, h] to [x0, x1, y0, y1] + """ + item_1 = anchors[:, :2] - (anchors[:, 2:] - 1) / 2 + item_2 = anchors[:, :2] + (anchors[:, 2:] - 1) / 2 + return np.concatenate((item_1, item_2), axis=1) + + +class MogPriorBox(object): + """ + both for fpn and single layer, single layer need to test + return (np.array) [num_anchros, 4] [x0, y0, x1, y1] + """ + + def __init__(self, + scale_list=[1.], + aspect_ratio_list=[1.0], + stride_list=[4, 8, 16, 32, 64, 128], + anchor_size_list=[16, 32, 64, 128, 256, 512]): + self.scale_list = scale_list + self.aspect_ratio_list = aspect_ratio_list + self.stride_list = stride_list + self.anchor_size_list = anchor_size_list + + def __call__(self, img_height, img_width): + final_anchor_list = [] + + for idx, stride in enumerate(self.stride_list): + anchor_list = [] + cur_img_height = img_height + cur_img_width = img_width + tmp_stride = stride + + while tmp_stride != 1: + tmp_stride = tmp_stride // 2 + cur_img_height = (cur_img_height + 1) // 2 + cur_img_width = (cur_img_width + 1) // 2 + + for i in range(cur_img_height): + for j in range(cur_img_width): + for scale in self.scale_list: + cx = (j + 0.5) * stride + cy = (i + 0.5) * stride + side_x = self.anchor_size_list[idx] * scale + side_y = self.anchor_size_list[idx] * scale + for ratio in self.aspect_ratio_list: + anchor_list.append([ + cx, cy, side_x / math.sqrt(ratio), + side_y * math.sqrt(ratio) + ]) + + final_anchor_list.append(anchor_list) + final_anchor_arr = np.concatenate(final_anchor_list, axis=0) + normalized_anchor_arr = normalize_anchor(final_anchor_arr).astype( + 'float32') + 
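        # Illustrative round trip (assuming the default stride/anchor-size
        # lists and the scale_list=[0.68] passed in by MogFaceDetector): the
        # first cell of the stride-4 level starts in center form as
        # [cx, cy, w, h] = [2.0, 2.0, 10.88, 10.88] (16 * 0.68),
        # normalize_anchor() turns it into corner form
        # [-2.94, -2.94, 6.94, 6.94], and transform_anchor() below restores
        # [2.0, 2.0, 10.88, 10.88], the (cx, cy, w, h) layout that mogdecode()
        # expects for its priors.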
transformed_anchor = transform_anchor(normalized_anchor_arr) + + return transformed_anchor + + +class PriorBox(object): + + def __init__(self, cfg, image_size=None, phase='train'): + super(PriorBox, self).__init__() + self.min_sizes = cfg['min_sizes'] + self.steps = cfg['steps'] + self.clip = cfg['clip'] + self.image_size = image_size + self.feature_maps = [[ + ceil(self.image_size[0] / step), + ceil(self.image_size[1] / step) + ] for step in self.steps] + self.name = 's' + + def forward(self): + anchors = [] + for k, f in enumerate(self.feature_maps): + min_sizes = self.min_sizes[k] + for i, j in product(range(f[0]), range(f[1])): + for min_size in min_sizes: + s_kx = min_size / self.image_size[1] + s_ky = min_size / self.image_size[0] + dense_cx = [ + x * self.steps[k] / self.image_size[1] + for x in [j + 0.5] + ] + dense_cy = [ + y * self.steps[k] / self.image_size[0] + for y in [i + 0.5] + ] + for cy, cx in product(dense_cy, dense_cx): + anchors += [cx, cy, s_kx, s_ky] + + # back to torch land + output = torch.Tensor(anchors).view(-1, 4) + if self.clip: + output.clamp_(max=1, min=0) + return output + + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + + +def mogdecode(loc, anchors): + """ + loc: torch.Tensor + anchors: 2-d, torch.Tensor (cx, cy, w, h) + boxes: 2-d, torch.Tensor (x0, y0, x1, y1) + """ + + boxes = torch.cat((anchors[:, :2] + loc[:, :2] * anchors[:, 2:], + anchors[:, 2:] * torch.exp(loc[:, 2:])), 1) + + boxes[:, 0] -= (boxes[:, 2] - 1) / 2 + boxes[:, 1] -= (boxes[:, 3] - 1) / 2 + boxes[:, 2] += boxes[:, 0] - 1 + boxes[:, 3] += boxes[:, 1] - 1 + + return boxes + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat( + (priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:] + b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:] + c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:] + d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:] + e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:] + landms = torch.cat((a, b, c, d, e), dim=1) + return landms diff --git a/modelscope/models/cv/face_detection/mtcnn/__init__.py b/modelscope/models/cv/face_detection/mtcnn/__init__.py new file mode 100644 index 00000000..b11c4740 --- /dev/null +++ b/modelscope/models/cv/face_detection/mtcnn/__init__.py @@ -0,0 +1 @@ +from .models.detector import MtcnnFaceDetector diff --git a/modelscope/models/cv/face_detection/mtcnn/models/__init__.py b/modelscope/models/cv/face_detection/mtcnn/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py b/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py new file mode 100644 index 00000000..f6a27b05 --- /dev/null +++ b/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py @@ -0,0 +1,240 @@ +# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch +import numpy as np +from PIL import Image + + +def nms(boxes, overlap_threshold=0.5, mode='union'): + """Non-maximum suppression. + + Arguments: + boxes: a float numpy array of shape [n, 5], + where each row is (xmin, ymin, xmax, ymax, score). + overlap_threshold: a float number. + mode: 'union' or 'min'. + + Returns: + list with indices of the selected boxes + """ + + # if there are no boxes, return the empty list + if len(boxes) == 0: + return [] + + # list of picked indices + pick = [] + + # grab the coordinates of the bounding boxes + x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)] + + area = (x2 - x1 + 1.0) * (y2 - y1 + 1.0) + ids = np.argsort(score) # in increasing order + + while len(ids) > 0: + + # grab index of the largest value + last = len(ids) - 1 + i = ids[last] + pick.append(i) + + # compute intersections + # of the box with the largest score + # with the rest of boxes + + # left top corner of intersection boxes + ix1 = np.maximum(x1[i], x1[ids[:last]]) + iy1 = np.maximum(y1[i], y1[ids[:last]]) + + # right bottom corner of intersection boxes + ix2 = np.minimum(x2[i], x2[ids[:last]]) + iy2 = np.minimum(y2[i], y2[ids[:last]]) + + # width and height of intersection boxes + w = np.maximum(0.0, ix2 - ix1 + 1.0) + h = np.maximum(0.0, iy2 - iy1 + 1.0) + + # intersections' areas + inter = w * h + if mode == 'min': + overlap = inter / np.minimum(area[i], area[ids[:last]]) + elif mode == 'union': + # intersection over union (IoU) + overlap = inter / (area[i] + area[ids[:last]] - inter) + + # delete all boxes where overlap is too big + ids = np.delete( + ids, + np.concatenate([[last], + np.where(overlap > overlap_threshold)[0]])) + + return pick + + +def convert_to_square(bboxes): + """Convert bounding boxes to a square form. + + Arguments: + bboxes: a float numpy array of shape [n, 5]. + + Returns: + a float numpy array of shape [n, 5], + squared bounding boxes. 
+ """ + + square_bboxes = np.zeros_like(bboxes) + x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] + h = y2 - y1 + 1.0 + w = x2 - x1 + 1.0 + max_side = np.maximum(h, w) + square_bboxes[:, 0] = x1 + w * 0.5 - max_side * 0.5 + square_bboxes[:, 1] = y1 + h * 0.5 - max_side * 0.5 + square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0 + square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0 + return square_bboxes + + +def calibrate_box(bboxes, offsets): + """Transform bounding boxes to be more like true bounding boxes. + 'offsets' is one of the outputs of the nets. + + Arguments: + bboxes: a float numpy array of shape [n, 5]. + offsets: a float numpy array of shape [n, 4]. + + Returns: + a float numpy array of shape [n, 5]. + """ + x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] + w = x2 - x1 + 1.0 + h = y2 - y1 + 1.0 + w = np.expand_dims(w, 1) + h = np.expand_dims(h, 1) + + # this is what happening here: + # tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)] + # x1_true = x1 + tx1*w + # y1_true = y1 + ty1*h + # x2_true = x2 + tx2*w + # y2_true = y2 + ty2*h + # below is just more compact form of this + + # are offsets always such that + # x1 < x2 and y1 < y2 ? + + translation = np.hstack([w, h, w, h]) * offsets + bboxes[:, 0:4] = bboxes[:, 0:4] + translation + return bboxes + + +def get_image_boxes(bounding_boxes, img, size=24): + """Cut out boxes from the image. + + Arguments: + bounding_boxes: a float numpy array of shape [n, 5]. + img: an instance of PIL.Image. + size: an integer, size of cutouts. + + Returns: + a float numpy array of shape [n, 3, size, size]. + """ + + num_boxes = len(bounding_boxes) + width, height = img.size + + [dy, edy, dx, edx, y, ey, x, ex, w, + h] = correct_bboxes(bounding_boxes, width, height) + img_boxes = np.zeros((num_boxes, 3, size, size), 'float32') + + for i in range(num_boxes): + img_box = np.zeros((h[i], w[i], 3), 'uint8') + + img_array = np.asarray(img, 'uint8') + img_box[dy[i]:(edy[i] + 1), dx[i]:(edx[i] + 1), :] =\ + img_array[y[i]:(ey[i] + 1), x[i]:(ex[i] + 1), :] + + # resize + img_box = Image.fromarray(img_box) + img_box = img_box.resize((size, size), Image.BILINEAR) + img_box = np.asarray(img_box, 'float32') + + img_boxes[i, :, :, :] = _preprocess(img_box) + + return img_boxes + + +def correct_bboxes(bboxes, width, height): + """Crop boxes that are too big and get coordinates + with respect to cutouts. + + Arguments: + bboxes: a float numpy array of shape [n, 5], + where each row is (xmin, ymin, xmax, ymax, score). + width: a float number. + height: a float number. + + Returns: + dy, dx, edy, edx: a int numpy arrays of shape [n], + coordinates of the boxes with respect to the cutouts. + y, x, ey, ex: a int numpy arrays of shape [n], + corrected ymin, xmin, ymax, xmax. + h, w: a int numpy arrays of shape [n], + just heights and widths of boxes. + + in the following order: + [dy, edy, dx, edx, y, ey, x, ex, w, h]. + """ + + x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] + w, h = x2 - x1 + 1.0, y2 - y1 + 1.0 + num_boxes = bboxes.shape[0] + + # 'e' stands for end + # (x, y) -> (ex, ey) + x, y, ex, ey = x1, y1, x2, y2 + + # we need to cut out a box from the image. + # (x, y, ex, ey) are corrected coordinates of the box + # in the image. + # (dx, dy, edx, edy) are coordinates of the box in the cutout + # from the image. 
+ dx, dy = np.zeros((num_boxes, )), np.zeros((num_boxes, )) + edx, edy = w.copy() - 1.0, h.copy() - 1.0 + + # if box's bottom right corner is too far right + ind = np.where(ex > width - 1.0)[0] + edx[ind] = w[ind] + width - 2.0 - ex[ind] + ex[ind] = width - 1.0 + + # if box's bottom right corner is too low + ind = np.where(ey > height - 1.0)[0] + edy[ind] = h[ind] + height - 2.0 - ey[ind] + ey[ind] = height - 1.0 + + # if box's top left corner is too far left + ind = np.where(x < 0.0)[0] + dx[ind] = 0.0 - x[ind] + x[ind] = 0.0 + + # if box's top left corner is too high + ind = np.where(y < 0.0)[0] + dy[ind] = 0.0 - y[ind] + y[ind] = 0.0 + + return_list = [dy, edy, dx, edx, y, ey, x, ex, w, h] + return_list = [i.astype('int32') for i in return_list] + + return return_list + + +def _preprocess(img): + """Preprocessing step before feeding the network. + + Arguments: + img: a float numpy array of shape [h, w, c]. + + Returns: + a float numpy array of shape [1, c, h, w]. + """ + img = img.transpose((2, 0, 1)) + img = np.expand_dims(img, 0) + img = (img - 127.5) * 0.0078125 + return img diff --git a/modelscope/models/cv/face_detection/mtcnn/models/detector.py b/modelscope/models/cv/face_detection/mtcnn/models/detector.py new file mode 100644 index 00000000..9c3aca3a --- /dev/null +++ b/modelscope/models/cv/face_detection/mtcnn/models/detector.py @@ -0,0 +1,149 @@ +# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch +import os + +import numpy as np +import torch +import torch.backends.cudnn as cudnn +from PIL import Image +from torch.autograd import Variable + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .box_utils import calibrate_box, convert_to_square, get_image_boxes, nms +from .first_stage import run_first_stage +from .get_nets import ONet, PNet, RNet + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.mtcnn) +class MtcnnFaceDetector(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.device = device + + self.pnet = PNet(model_path=os.path.join(self.model_path, 'pnet.npy')) + self.rnet = RNet(model_path=os.path.join(self.model_path, 'rnet.npy')) + self.onet = ONet(model_path=os.path.join(self.model_path, 'onet.npy')) + + self.pnet = self.pnet.to(device) + self.rnet = self.rnet.to(device) + self.onet = self.onet.to(device) + + def forward(self, input): + image = Image.fromarray(np.uint8(input['img'].cpu().numpy())) + pnet = self.pnet + rnet = self.rnet + onet = self.onet + onet.eval() + + min_face_size = 20.0 + thresholds = [0.7, 0.8, 0.9] + nms_thresholds = [0.7, 0.7, 0.7] + + # BUILD AN IMAGE PYRAMID + width, height = image.size + min_length = min(height, width) + + min_detection_size = 12 + factor = 0.707 # sqrt(0.5) + + # scales for scaling the image + scales = [] + + m = min_detection_size / min_face_size + min_length *= m + + factor_count = 0 + while min_length > min_detection_size: + scales.append(m * factor**factor_count) + min_length *= factor + factor_count += 1 + + # STAGE 1 + + # it will be returned + bounding_boxes = [] + + # run P-Net on different scales + for s in scales: + boxes = run_first_stage( + image, + pnet, + scale=s, + threshold=thresholds[0], + device=self.device) + bounding_boxes.append(boxes) + + # collect boxes (and offsets, and 
scores) from different scales + bounding_boxes = [i for i in bounding_boxes if i is not None] + bounding_boxes = np.vstack(bounding_boxes) + + keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0]) + bounding_boxes = bounding_boxes[keep] + + # use offsets predicted by pnet to transform bounding boxes + bounding_boxes = calibrate_box(bounding_boxes[:, 0:5], + bounding_boxes[:, 5:]) + # shape [n_boxes, 5] + + bounding_boxes = convert_to_square(bounding_boxes) + bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) + + # STAGE 2 + + img_boxes = get_image_boxes(bounding_boxes, image, size=24) + img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True) + output = rnet(img_boxes.to(self.device)) + offsets = output[0].cpu().data.numpy() # shape [n_boxes, 4] + probs = output[1].cpu().data.numpy() # shape [n_boxes, 2] + + keep = np.where(probs[:, 1] > thresholds[1])[0] + bounding_boxes = bounding_boxes[keep] + bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, )) + offsets = offsets[keep] + + keep = nms(bounding_boxes, nms_thresholds[1]) + bounding_boxes = bounding_boxes[keep] + bounding_boxes = calibrate_box(bounding_boxes, offsets[keep]) + bounding_boxes = convert_to_square(bounding_boxes) + bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) + + # STAGE 3 + + img_boxes = get_image_boxes(bounding_boxes, image, size=48) + if len(img_boxes) == 0: + return [], [] + img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True) + output = onet(img_boxes.to(self.device)) + landmarks = output[0].cpu().data.numpy() # shape [n_boxes, 10] + offsets = output[1].cpu().data.numpy() # shape [n_boxes, 4] + probs = output[2].cpu().data.numpy() # shape [n_boxes, 2] + + keep = np.where(probs[:, 1] > thresholds[2])[0] + bounding_boxes = bounding_boxes[keep] + bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, )) + offsets = offsets[keep] + landmarks = landmarks[keep] + + # compute landmark points + width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0 + height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0 + xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1] + landmarks[:, 0:5] = np.expand_dims( + xmin, 1) + np.expand_dims(width, 1) * landmarks[:, 0:5] + landmarks[:, 5:10] = np.expand_dims( + ymin, 1) + np.expand_dims(height, 1) * landmarks[:, 5:10] + + bounding_boxes = calibrate_box(bounding_boxes, offsets) + keep = nms(bounding_boxes, nms_thresholds[2], mode='min') + bounding_boxes = bounding_boxes[keep] + landmarks = landmarks[keep] + landmarks = landmarks.reshape(-1, 2, 5).transpose( + (0, 2, 1)).reshape(-1, 10) + + return bounding_boxes, landmarks diff --git a/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py b/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py new file mode 100644 index 00000000..e2aba47e --- /dev/null +++ b/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py @@ -0,0 +1,100 @@ +# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch +import math + +import numpy as np +import torch +from PIL import Image +from torch.autograd import Variable + +from .box_utils import _preprocess, nms + + +def run_first_stage(image, net, scale, threshold, device='cuda'): + """Run P-Net, generate bounding boxes, and do NMS. + + Arguments: + image: an instance of PIL.Image. + net: an instance of pytorch's nn.Module, P-Net. + scale: a float number, + scale width and height of the image by this number. 
+ threshold: a float number, + threshold on the probability of a face when generating + bounding boxes from predictions of the net. + + Returns: + a float numpy array of shape [n_boxes, 9], + bounding boxes with scores and offsets (4 + 1 + 4). + """ + + # scale the image and convert it to a float array + width, height = image.size + sw, sh = math.ceil(width * scale), math.ceil(height * scale) + img = image.resize((sw, sh), Image.BILINEAR) + img = np.asarray(img, 'float32') + + img = Variable( + torch.FloatTensor(_preprocess(img)), volatile=True).to(device) + output = net(img) + probs = output[1].cpu().data.numpy()[0, 1, :, :] + offsets = output[0].cpu().data.numpy() + # probs: probability of a face at each sliding window + # offsets: transformations to true bounding boxes + + boxes = _generate_bboxes(probs, offsets, scale, threshold) + if len(boxes) == 0: + return None + + keep = nms(boxes[:, 0:5], overlap_threshold=0.5) + return boxes[keep] + + +def _generate_bboxes(probs, offsets, scale, threshold): + """Generate bounding boxes at places + where there is probably a face. + + Arguments: + probs: a float numpy array of shape [n, m]. + offsets: a float numpy array of shape [1, 4, n, m]. + scale: a float number, + width and height of the image were scaled by this number. + threshold: a float number. + + Returns: + a float numpy array of shape [n_boxes, 9] + """ + + # applying P-Net is equivalent, in some sense, to + # moving 12x12 window with stride 2 + stride = 2 + cell_size = 12 + + # indices of boxes where there is probably a face + inds = np.where(probs > threshold) + + if inds[0].size == 0: + return np.array([]) + + # transformations of bounding boxes + tx1, ty1, tx2, ty2 = [offsets[0, i, inds[0], inds[1]] for i in range(4)] + # they are defined as: + # w = x2 - x1 + 1 + # h = y2 - y1 + 1 + # x1_true = x1 + tx1*w + # x2_true = x2 + tx2*w + # y1_true = y1 + ty1*h + # y2_true = y2 + ty2*h + + offsets = np.array([tx1, ty1, tx2, ty2]) + score = probs[inds[0], inds[1]] + + # P-Net is applied to scaled images + # so we need to rescale bounding boxes back + bounding_boxes = np.vstack([ + np.round((stride * inds[1] + 1.0) / scale), + np.round((stride * inds[0] + 1.0) / scale), + np.round((stride * inds[1] + 1.0 + cell_size) / scale), + np.round((stride * inds[0] + 1.0 + cell_size) / scale), score, offsets + ]) + # why one is added? + + return bounding_boxes.T diff --git a/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py b/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py new file mode 100644 index 00000000..5fbbd33b --- /dev/null +++ b/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py @@ -0,0 +1,160 @@ +# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Flatten(nn.Module): + + def __init__(self): + super(Flatten, self).__init__() + + def forward(self, x): + """ + Arguments: + x: a float tensor with shape [batch_size, c, h, w]. + Returns: + a float tensor with shape [batch_size, c*h*w]. 
+ """ + + # without this pretrained model isn't working + x = x.transpose(3, 2).contiguous() + + return x.view(x.size(0), -1) + + +class PNet(nn.Module): + + def __init__(self, model_path=None): + + super(PNet, self).__init__() + + # suppose we have input with size HxW, then + # after first layer: H - 2, + # after pool: ceil((H - 2)/2), + # after second conv: ceil((H - 2)/2) - 2, + # after last conv: ceil((H - 2)/2) - 4, + # and the same for W + + self.features = nn.Sequential( + OrderedDict([('conv1', nn.Conv2d(3, 10, 3, 1)), + ('prelu1', nn.PReLU(10)), + ('pool1', nn.MaxPool2d(2, 2, ceil_mode=True)), + ('conv2', nn.Conv2d(10, 16, 3, 1)), + ('prelu2', nn.PReLU(16)), + ('conv3', nn.Conv2d(16, 32, 3, 1)), + ('prelu3', nn.PReLU(32))])) + + self.conv4_1 = nn.Conv2d(32, 2, 1, 1) + self.conv4_2 = nn.Conv2d(32, 4, 1, 1) + + weights = np.load(model_path, allow_pickle=True)[()] + for n, p in self.named_parameters(): + p.data = torch.FloatTensor(weights[n]) + + def forward(self, x): + """ + Arguments: + x: a float tensor with shape [batch_size, 3, h, w]. + Returns: + b: a float tensor with shape [batch_size, 4, h', w']. + a: a float tensor with shape [batch_size, 2, h', w']. + """ + x = self.features(x) + a = self.conv4_1(x) + b = self.conv4_2(x) + a = F.softmax(a) + return b, a + + +class RNet(nn.Module): + + def __init__(self, model_path=None): + + super(RNet, self).__init__() + + self.features = nn.Sequential( + OrderedDict([('conv1', nn.Conv2d(3, 28, 3, 1)), + ('prelu1', nn.PReLU(28)), + ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), + ('conv2', nn.Conv2d(28, 48, 3, 1)), + ('prelu2', nn.PReLU(48)), + ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), + ('conv3', nn.Conv2d(48, 64, 2, 1)), + ('prelu3', nn.PReLU(64)), ('flatten', Flatten()), + ('conv4', nn.Linear(576, 128)), + ('prelu4', nn.PReLU(128))])) + + self.conv5_1 = nn.Linear(128, 2) + self.conv5_2 = nn.Linear(128, 4) + + weights = np.load(model_path, allow_pickle=True)[()] + for n, p in self.named_parameters(): + p.data = torch.FloatTensor(weights[n]) + + def forward(self, x): + """ + Arguments: + x: a float tensor with shape [batch_size, 3, h, w]. + Returns: + b: a float tensor with shape [batch_size, 4]. + a: a float tensor with shape [batch_size, 2]. + """ + x = self.features(x) + a = self.conv5_1(x) + b = self.conv5_2(x) + a = F.softmax(a) + return b, a + + +class ONet(nn.Module): + + def __init__(self, model_path=None): + + super(ONet, self).__init__() + + self.features = nn.Sequential( + OrderedDict([ + ('conv1', nn.Conv2d(3, 32, 3, 1)), + ('prelu1', nn.PReLU(32)), + ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), + ('conv2', nn.Conv2d(32, 64, 3, 1)), + ('prelu2', nn.PReLU(64)), + ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), + ('conv3', nn.Conv2d(64, 64, 3, 1)), + ('prelu3', nn.PReLU(64)), + ('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)), + ('conv4', nn.Conv2d(64, 128, 2, 1)), + ('prelu4', nn.PReLU(128)), + ('flatten', Flatten()), + ('conv5', nn.Linear(1152, 256)), + ('drop5', nn.Dropout(0.25)), + ('prelu5', nn.PReLU(256)), + ])) + + self.conv6_1 = nn.Linear(256, 2) + self.conv6_2 = nn.Linear(256, 4) + self.conv6_3 = nn.Linear(256, 10) + + weights = np.load(model_path, allow_pickle=True)[()] + for n, p in self.named_parameters(): + p.data = torch.FloatTensor(weights[n]) + + def forward(self, x): + """ + Arguments: + x: a float tensor with shape [batch_size, 3, h, w]. + Returns: + c: a float tensor with shape [batch_size, 10]. + b: a float tensor with shape [batch_size, 4]. + a: a float tensor with shape [batch_size, 2]. 
+ """ + x = self.features(x) + a = self.conv6_1(x) + b = self.conv6_2(x) + c = self.conv6_3(x) + a = F.softmax(a) + return c, b, a diff --git a/modelscope/models/cv/face_detection/ulfd_slim/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py new file mode 100644 index 00000000..41a2226a --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py @@ -0,0 +1 @@ +from .detection import UlfdFaceDetector diff --git a/modelscope/models/cv/face_detection/ulfd_slim/detection.py b/modelscope/models/cv/face_detection/ulfd_slim/detection.py new file mode 100755 index 00000000..c0e2da6e --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/detection.py @@ -0,0 +1,44 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +import os + +import cv2 +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .vision.ssd.fd_config import define_img_size +from .vision.ssd.mb_tiny_fd import (create_mb_tiny_fd, + create_mb_tiny_fd_predictor) + +define_img_size(640) + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.ulfd) +class UlfdFaceDetector(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.device = device + self.net = create_mb_tiny_fd(2, is_test=True, device=device) + self.predictor = create_mb_tiny_fd_predictor( + self.net, candidate_size=1500, device=device) + self.net.load(model_path) + self.net = self.net.to(device) + + def forward(self, input): + img_raw = input['img'] + img = np.array(img_raw.cpu().detach()) + img = img[:, :, ::-1] + prob_th = 0.85 + keep_top_k = 750 + boxes, labels, probs = self.predictor.predict(img, keep_top_k, prob_th) + return boxes, probs diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py new file mode 100644 index 00000000..46d3b890 --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py @@ -0,0 +1,124 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +import math + +import torch + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. 
+ Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + _, indexes = scores.sort(descending=True) + indexes = indexes[:candidate_size] + while len(indexes) > 0: + current = indexes[0] + picked.append(current.item()) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[1:] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + current_box.unsqueeze(0), + ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def nms(box_scores, + nms_method=None, + score_threshold=None, + iou_threshold=None, + sigma=0.5, + top_k=-1, + candidate_size=200): + return hard_nms( + box_scores, iou_threshold, top_k, candidate_size=candidate_size) + + +def generate_priors(feature_map_list, + shrinkage_list, + image_size, + min_boxes, + clamp=True) -> torch.Tensor: + priors = [] + for index in range(0, len(feature_map_list[0])): + scale_w = image_size[0] / shrinkage_list[0][index] + scale_h = image_size[1] / shrinkage_list[1][index] + for j in range(0, feature_map_list[1][index]): + for i in range(0, feature_map_list[0][index]): + x_center = (i + 0.5) / scale_w + y_center = (j + 0.5) / scale_h + + for min_box in min_boxes[index]: + w = min_box / image_size[0] + h = min_box / image_size[1] + priors.append([x_center, y_center, w, h]) + priors = torch.tensor(priors) + if clamp: + torch.clamp(priors, 0.0, 1.0, out=priors) + return priors + + +def convert_locations_to_boxes(locations, priors, center_variance, + size_variance): + # priors can have one dimension less. + if priors.dim() + 1 == locations.dim(): + priors = priors.unsqueeze(0) + a = locations[..., :2] * center_variance * priors[..., + 2:] + priors[..., :2] + b = torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:] + + return torch.cat([a, b], dim=locations.dim() - 1) + + +def center_form_to_corner_form(locations): + a = locations[..., :2] - locations[..., 2:] / 2 + b = locations[..., :2] + locations[..., 2:] / 2 + return torch.cat([a, b], locations.dim() - 1) + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def area_of(left_top, right_bottom) -> torch.Tensor: + """Compute the areas of rectangles given two corners. + + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + + Returns: + area (N): return the area. 
+ """ + hw = torch.clamp(right_bottom - left_top, min=0.0) + return hw[..., 0] * hw[..., 1] diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py new file mode 100644 index 00000000..8bbcef41 --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py @@ -0,0 +1,49 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +import torch.nn as nn +import torch.nn.functional as F + + +class Mb_Tiny(nn.Module): + + def __init__(self, num_classes=2): + super(Mb_Tiny, self).__init__() + self.base_channel = 8 * 2 + + def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), nn.ReLU(inplace=True)) + + def conv_dw(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU(inplace=True), + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True), + ) + + self.model = nn.Sequential( + conv_bn(3, self.base_channel, 2), # 160*120 + conv_dw(self.base_channel, self.base_channel * 2, 1), + conv_dw(self.base_channel * 2, self.base_channel * 2, 2), # 80*60 + conv_dw(self.base_channel * 2, self.base_channel * 2, 1), + conv_dw(self.base_channel * 2, self.base_channel * 4, 2), # 40*30 + conv_dw(self.base_channel * 4, self.base_channel * 4, 1), + conv_dw(self.base_channel * 4, self.base_channel * 4, 1), + conv_dw(self.base_channel * 4, self.base_channel * 4, 1), + conv_dw(self.base_channel * 4, self.base_channel * 8, 2), # 20*15 + conv_dw(self.base_channel * 8, self.base_channel * 8, 1), + conv_dw(self.base_channel * 8, self.base_channel * 8, 1), + conv_dw(self.base_channel * 8, self.base_channel * 16, 2), # 10*8 + conv_dw(self.base_channel * 16, self.base_channel * 16, 1)) + self.fc = nn.Linear(1024, num_classes) + + def forward(self, x): + x = self.model(x) + x = F.avg_pool2d(x, 7) + x = x.view(-1, 1024) + x = self.fc(x) + return x diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py new file mode 100644 index 00000000..9251d67f --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py @@ -0,0 +1,18 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +from ..transforms import Compose, Resize, SubtractMeans, ToTensor + + +class PredictionTransform: + + def __init__(self, size, mean=0.0, std=1.0): + self.transform = Compose([ + Resize(size), + SubtractMeans(mean), lambda img, boxes=None, labels=None: + (img / std, boxes, labels), + ToTensor() + ]) + + def __call__(self, image): + image, _, _ = self.transform(image) + return image diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py new file mode 100644 index 00000000..495a2fcd --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py @@ -0,0 +1,49 @@ +# The implementation is based on ULFD, available at +# 
https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +import numpy as np + +from ..box_utils import generate_priors + +image_mean_test = image_mean = np.array([127, 127, 127]) +image_std = 128.0 +iou_threshold = 0.3 +center_variance = 0.1 +size_variance = 0.2 + +min_boxes = [[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]] +shrinkage_list = [] +image_size = [320, 240] # default input size 320*240 +feature_map_w_h_list = [[40, 20, 10, 5], [30, 15, 8, + 4]] # default feature map size +priors = [] + + +def define_img_size(size): + global image_size, feature_map_w_h_list, priors + img_size_dict = { + 128: [128, 96], + 160: [160, 120], + 320: [320, 240], + 480: [480, 360], + 640: [640, 480], + 1280: [1280, 960] + } + image_size = img_size_dict[size] + + feature_map_w_h_list_dict = { + 128: [[16, 8, 4, 2], [12, 6, 3, 2]], + 160: [[20, 10, 5, 3], [15, 8, 4, 2]], + 320: [[40, 20, 10, 5], [30, 15, 8, 4]], + 480: [[60, 30, 15, 8], [45, 23, 12, 6]], + 640: [[80, 40, 20, 10], [60, 30, 15, 8]], + 1280: [[160, 80, 40, 20], [120, 60, 30, 15]] + } + feature_map_w_h_list = feature_map_w_h_list_dict[size] + + for i in range(0, len(image_size)): + item_list = [] + for k in range(0, len(feature_map_w_h_list[i])): + item_list.append(image_size[i] / feature_map_w_h_list[i][k]) + shrinkage_list.append(item_list) + priors = generate_priors(feature_map_w_h_list, shrinkage_list, image_size, + min_boxes) diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py new file mode 100644 index 00000000..91ed268d --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py @@ -0,0 +1,124 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +from torch.nn import Conv2d, ModuleList, ReLU, Sequential + +from ..mb_tiny import Mb_Tiny +from . import fd_config as config +from .predictor import Predictor +from .ssd import SSD + + +def SeperableConv2d(in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. 
+ """ + return Sequential( + Conv2d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + padding=padding), + ReLU(), + Conv2d( + in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_mb_tiny_fd(num_classes, is_test=False, device='cuda'): + base_net = Mb_Tiny(2) + base_net_model = base_net.model # disable dropout layer + + source_layer_indexes = [8, 11, 13] + extras = ModuleList([ + Sequential( + Conv2d( + in_channels=base_net.base_channel * 16, + out_channels=base_net.base_channel * 4, + kernel_size=1), ReLU(), + SeperableConv2d( + in_channels=base_net.base_channel * 4, + out_channels=base_net.base_channel * 16, + kernel_size=3, + stride=2, + padding=1), ReLU()) + ]) + + regression_headers = ModuleList([ + SeperableConv2d( + in_channels=base_net.base_channel * 4, + out_channels=3 * 4, + kernel_size=3, + padding=1), + SeperableConv2d( + in_channels=base_net.base_channel * 8, + out_channels=2 * 4, + kernel_size=3, + padding=1), + SeperableConv2d( + in_channels=base_net.base_channel * 16, + out_channels=2 * 4, + kernel_size=3, + padding=1), + Conv2d( + in_channels=base_net.base_channel * 16, + out_channels=3 * 4, + kernel_size=3, + padding=1) + ]) + + classification_headers = ModuleList([ + SeperableConv2d( + in_channels=base_net.base_channel * 4, + out_channels=3 * num_classes, + kernel_size=3, + padding=1), + SeperableConv2d( + in_channels=base_net.base_channel * 8, + out_channels=2 * num_classes, + kernel_size=3, + padding=1), + SeperableConv2d( + in_channels=base_net.base_channel * 16, + out_channels=2 * num_classes, + kernel_size=3, + padding=1), + Conv2d( + in_channels=base_net.base_channel * 16, + out_channels=3 * num_classes, + kernel_size=3, + padding=1) + ]) + + return SSD( + num_classes, + base_net_model, + source_layer_indexes, + extras, + classification_headers, + regression_headers, + is_test=is_test, + config=config, + device=device) + + +def create_mb_tiny_fd_predictor(net, + candidate_size=200, + nms_method=None, + sigma=0.5, + device=None): + predictor = Predictor( + net, + config.image_size, + config.image_mean_test, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py new file mode 100644 index 00000000..f71820a5 --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py @@ -0,0 +1,80 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +import torch + +from .. 
import box_utils +from .data_preprocessing import PredictionTransform + + +class Predictor: + + def __init__(self, + net, + size, + mean=0.0, + std=1.0, + nms_method=None, + iou_threshold=0.3, + filter_threshold=0.85, + candidate_size=200, + sigma=0.5, + device=None): + self.net = net + self.transform = PredictionTransform(size, mean, std) + self.iou_threshold = iou_threshold + self.filter_threshold = filter_threshold + self.candidate_size = candidate_size + self.nms_method = nms_method + + self.sigma = sigma + if device: + self.device = device + else: + self.device = torch.device( + 'cuda:0' if torch.cuda.is_available() else 'cpu') + + self.net.to(self.device) + self.net.eval() + + def predict(self, image, top_k=-1, prob_threshold=None): + height, width, _ = image.shape + image = self.transform(image) + images = image.unsqueeze(0) + images = images.to(self.device) + with torch.no_grad(): + for i in range(1): + scores, boxes = self.net.forward(images) + boxes = boxes[0] + scores = scores[0] + if not prob_threshold: + prob_threshold = self.filter_threshold + # this version of nms is slower on GPU, so we move data to CPU. + picked_box_probs = [] + picked_labels = [] + for class_index in range(1, scores.size(1)): + probs = scores[:, class_index] + mask = probs > prob_threshold + probs = probs[mask] + if probs.size(0) == 0: + continue + subset_boxes = boxes[mask, :] + box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1) + box_probs = box_utils.nms( + box_probs, + self.nms_method, + score_threshold=prob_threshold, + iou_threshold=self.iou_threshold, + sigma=self.sigma, + top_k=top_k, + candidate_size=self.candidate_size) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.size(0)) + if not picked_box_probs: + return torch.tensor([]), torch.tensor([]), torch.tensor([]) + picked_box_probs = torch.cat(picked_box_probs) + picked_box_probs[:, 0] *= width + picked_box_probs[:, 1] *= height + picked_box_probs[:, 2] *= width + picked_box_probs[:, 3] *= height + return picked_box_probs[:, :4], torch.tensor( + picked_labels), picked_box_probs[:, 4] diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py new file mode 100644 index 00000000..08ff93a4 --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py @@ -0,0 +1,129 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +from collections import namedtuple +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .. import box_utils + +GraphPath = namedtuple('GraphPath', ['s0', 'name', 's1']) + + +class SSD(nn.Module): + + def __init__(self, + num_classes: int, + base_net: nn.ModuleList, + source_layer_indexes: List[int], + extras: nn.ModuleList, + classification_headers: nn.ModuleList, + regression_headers: nn.ModuleList, + is_test=False, + config=None, + device=None): + """Compose a SSD model using the given components. 
+ """ + super(SSD, self).__init__() + + self.num_classes = num_classes + self.base_net = base_net + self.source_layer_indexes = source_layer_indexes + self.extras = extras + self.classification_headers = classification_headers + self.regression_headers = regression_headers + self.is_test = is_test + self.config = config + + # register layers in source_layer_indexes by adding them to a module list + self.source_layer_add_ons = nn.ModuleList([ + t[1] for t in source_layer_indexes + if isinstance(t, tuple) and not isinstance(t, GraphPath) + ]) + if device: + self.device = device + else: + self.device = torch.device( + 'cuda:0' if torch.cuda.is_available() else 'cpu') + if is_test: + self.config = config + self.priors = config.priors.to(self.device) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + confidences = [] + locations = [] + start_layer_index = 0 + header_index = 0 + end_layer_index = 0 + for end_layer_index in self.source_layer_indexes: + if isinstance(end_layer_index, GraphPath): + path = end_layer_index + end_layer_index = end_layer_index.s0 + added_layer = None + elif isinstance(end_layer_index, tuple): + added_layer = end_layer_index[1] + end_layer_index = end_layer_index[0] + path = None + else: + added_layer = None + path = None + for layer in self.base_net[start_layer_index:end_layer_index]: + x = layer(x) + if added_layer: + y = added_layer(x) + else: + y = x + if path: + sub = getattr(self.base_net[end_layer_index], path.name) + for layer in sub[:path.s1]: + x = layer(x) + y = x + for layer in sub[path.s1:]: + x = layer(x) + end_layer_index += 1 + start_layer_index = end_layer_index + confidence, location = self.compute_header(header_index, y) + header_index += 1 + confidences.append(confidence) + locations.append(location) + + for layer in self.base_net[end_layer_index:]: + x = layer(x) + + for layer in self.extras: + x = layer(x) + confidence, location = self.compute_header(header_index, x) + header_index += 1 + confidences.append(confidence) + locations.append(location) + + confidences = torch.cat(confidences, 1) + locations = torch.cat(locations, 1) + + if self.is_test: + confidences = F.softmax(confidences, dim=2) + boxes = box_utils.convert_locations_to_boxes( + locations, self.priors, self.config.center_variance, + self.config.size_variance) + boxes = box_utils.center_form_to_corner_form(boxes) + return confidences, boxes + else: + return confidences, locations + + def compute_header(self, i, x): + confidence = self.classification_headers[i](x) + confidence = confidence.permute(0, 2, 3, 1).contiguous() + confidence = confidence.view(confidence.size(0), -1, self.num_classes) + + location = self.regression_headers[i](x) + location = location.permute(0, 2, 3, 1).contiguous() + location = location.view(location.size(0), -1, 4) + + return confidence, location + + def load(self, model): + self.load_state_dict( + torch.load(model, map_location=lambda storage, loc: storage)) diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py new file mode 100644 index 00000000..7c5331f1 --- /dev/null +++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py @@ -0,0 +1,56 @@ +# The implementation is based on ULFD, available at +# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB +import types + +import cv2 +import numpy as np +import torch +from numpy import random + + +class Compose(object): + """Composes several augmentations 
together. + Args: + transforms (List[Transform]): list of transforms to compose. + Example: + >>> augmentations.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, boxes=None, labels=None): + for t in self.transforms: + img, boxes, labels = t(img, boxes, labels) + return img, boxes, labels + + +class SubtractMeans(object): + + def __init__(self, mean): + self.mean = np.array(mean, dtype=np.float32) + + def __call__(self, image, boxes=None, labels=None): + image = image.astype(np.float32) + image -= self.mean + return image.astype(np.float32), boxes, labels + + +class Resize(object): + + def __init__(self, size=(300, 300)): + self.size = size + + def __call__(self, image, boxes=None, labels=None): + image = cv2.resize(image, (self.size[0], self.size[1])) + return image, boxes, labels + + +class ToTensor(object): + + def __call__(self, cvimage, boxes=None, labels=None): + return torch.from_numpy(cvimage.astype(np.float32)).permute( + 2, 0, 1), boxes, labels diff --git a/modelscope/models/cv/face_recognition/align_face.py b/modelscope/models/cv/face_recognition/align_face.py index a6469a10..0477375a 100644 --- a/modelscope/models/cv/face_recognition/align_face.py +++ b/modelscope/models/cv/face_recognition/align_face.py @@ -1,3 +1,7 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/blob/master/python-package/insightface/utils/face_align.py +""" import cv2 import numpy as np from skimage import transform as trans diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py index a58d8e17..afe89963 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50, IR_SE_101, IR_SE_152, IR_SE_200) from .model_resnet import ResNet_50, ResNet_101, ResNet_152 diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py index 426d2591..a1683225 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py @@ -1,3 +1,5 @@ +# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/common.py import torch import torch.nn as nn from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Linear, Module, ReLU, diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py index 4fb7ee9c..1982ca05 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py @@ -1,5 +1,5 @@ -# based on: -# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_irse.py +# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 
license at +# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_irse.py from collections import namedtuple from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py index 7072f384..568e24ff 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py @@ -1,5 +1,5 @@ -# based on: -# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_resnet.py +# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_resnet.py import torch.nn as nn from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, MaxPool2d, Module, ReLU, Sequential) diff --git a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py index 43e52292..531e2efd 100644 --- a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py +++ b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py @@ -105,12 +105,12 @@ def get_img_ins_seg_result(img_seg_result=None, } for seg_result in img_seg_result: - box = { - 'x': np.int(seg_result[0]), - 'y': np.int(seg_result[1]), - 'w': np.int(seg_result[2] - seg_result[0]), - 'h': np.int(seg_result[3] - seg_result[1]) - } + box = [ + np.int(seg_result[0]), + np.int(seg_result[1]), + np.int(seg_result[2]), + np.int(seg_result[3]) + ] score = np.float(seg_result[4]) category = seg_result[5] @@ -161,12 +161,10 @@ def show_result( np.random.random() * 255.0 ]) - x1 = int(box['x']) - y1 = int(box['y']) - w = int(box['w']) - h = int(box['h']) - x2 = x1 + w - y2 = y1 + h + x1 = int(box[0]) + y1 = int(box[1]) + x2 = int(box[2]) + y2 = int(box[3]) if show_box: cv2.rectangle( diff --git a/modelscope/models/cv/image_reid_person/pass_model.py b/modelscope/models/cv/image_reid_person/pass_model.py index 2222fedb..3b032949 100644 --- a/modelscope/models/cv/image_reid_person/pass_model.py +++ b/modelscope/models/cv/image_reid_person/pass_model.py @@ -1,4 +1,4 @@ -# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on +# The implementation is adopted from PASS-reID, made pubicly available under the Apache-2.0 License at # https://github.com/CASIA-IVA-Lab/PASS-reID import os diff --git a/modelscope/models/cv/image_reid_person/transreid_model.py b/modelscope/models/cv/image_reid_person/transreid_model.py index 275c4e22..5bceb468 100644 --- a/modelscope/models/cv/image_reid_person/transreid_model.py +++ b/modelscope/models/cv/image_reid_person/transreid_model.py @@ -1,4 +1,4 @@ -# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on +# The implementation is adopted from PASS-reID, made pubicly available under the Apache-2.0 License at # https://github.com/CASIA-IVA-Lab/PASS-reID import collections.abc as container_abcs diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index 8b82d1d1..171aafbd 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -552,7 +552,7 @@ class CLIPVisionTransformer(nn.Module): nn.GroupNorm(1, embed_dim), 
nn.ConvTranspose2d( embed_dim, embed_dim, kernel_size=2, stride=2), - nn.SyncBatchNorm(embed_dim), + nn.BatchNorm2d(embed_dim), nn.GELU(), nn.ConvTranspose2d( embed_dim, embed_dim, kernel_size=2, stride=2), diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py index 409c583b..0aeeb1de 100644 --- a/modelscope/models/cv/shop_segmentation/shop_seg_model.py +++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py @@ -33,18 +33,18 @@ class ShopSegmentation(TorchModel): model_dir=model_dir, device_id=device_id, *args, **kwargs) self.model = SHOPSEG(model_dir=model_dir) - pretrained_params = torch.load('{}/{}'.format( - model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) - + pretrained_params = torch.load( + '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), + map_location='cpu') self.model.load_state_dict(pretrained_params) self.model.eval() - self.device_id = device_id - if self.device_id >= 0 and torch.cuda.is_available(): - self.model.to('cuda:{}'.format(self.device_id)) - logger.info('Use GPU: {}'.format(self.device_id)) + if device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(device_id)) + logger.info('Use GPU: {}'.format(device_id)) else: - self.device_id = -1 + device_id = -1 logger.info('Use CPU for inference') + self.device_id = device_id def preprocess(self, img, size=1024): mean = [0.48145466, 0.4578275, 0.40821073] diff --git a/modelscope/models/cv/video_inpainting/__init__.py b/modelscope/models/cv/video_inpainting/__init__.py new file mode 100644 index 00000000..f5489da9 --- /dev/null +++ b/modelscope/models/cv/video_inpainting/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
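The device handling above follows a common pattern: deserialize the checkpoint on CPU first, then move to GPU only if one is requested and actually available. A minimal sketch of that pattern (the path and function name are illustrative):

import torch

def load_state_dict_cpu_first(path: str, device_id: int = -1):
    # map_location='cpu' keeps the load working on CPU-only machines,
    # even if the checkpoint was saved from CUDA tensors.
    state = torch.load(path, map_location='cpu')
    if device_id >= 0 and torch.cuda.is_available():
        device = 'cuda:{}'.format(device_id)
    else:
        device, device_id = 'cpu', -1
    return state, device, device_id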
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .inpainting_model import VideoInpainting + +else: + _import_structure = {'inpainting_model': ['VideoInpainting']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_inpainting/inpainting.py b/modelscope/models/cv/video_inpainting/inpainting.py new file mode 100644 index 00000000..e2af2ad0 --- /dev/null +++ b/modelscope/models/cv/video_inpainting/inpainting.py @@ -0,0 +1,299 @@ +""" VideoInpaintingProcess +The implementation here is modified based on STTN, +originally Apache 2.0 License and publicly avaialbe at https://github.com/researchmm/STTN +""" + +import os +import time + +import cv2 +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +torch.backends.cudnn.enabled = False + +w, h = 192, 96 +ref_length = 300 +neighbor_stride = 20 +default_fps = 24 +MAX_frame = 300 + + +def video_process(video_input_path): + video_input = cv2.VideoCapture(video_input_path) + success, frame = video_input.read() + if success is False: + decode_error = 'decode_error' + w, h, fps = 0, 0, 0 + else: + decode_error = None + h, w = frame.shape[0:2] + fps = video_input.get(cv2.CAP_PROP_FPS) + video_input.release() + + return decode_error, fps, w, h + + +class Stack(object): + + def __init__(self, roll=False): + self.roll = roll + + def __call__(self, img_group): + mode = img_group[0].mode + if mode == '1': + img_group = [img.convert('L') for img in img_group] + mode = 'L' + if mode == 'L': + return np.stack([np.expand_dims(x, 2) for x in img_group], axis=2) + elif mode == 'RGB': + if self.roll: + return np.stack([np.array(x)[:, :, ::-1] for x in img_group], + axis=2) + else: + return np.stack(img_group, axis=2) + else: + raise NotImplementedError(f'Image mode {mode}') + + +class ToTorchFormatTensor(object): + """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] + to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """ + + def __init__(self, div=True): + self.div = div + + def __call__(self, pic): + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous() + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + img = img.view(pic.size[1], pic.size[0], len(pic.mode)) + img = img.transpose(0, 1).transpose(0, 2).contiguous() + img = img.float().div(255) if self.div else img.float() + return img + + +_to_tensors = transforms.Compose([Stack(), ToTorchFormatTensor()]) + + +def get_crop_mask_v1(mask): + orig_h, orig_w, _ = mask.shape + if (mask == 255).all(): + return mask, (0, int(orig_h), 0, + int(orig_w)), [0, int(orig_h), 0, + int(orig_w) + ], [0, int(orig_h), 0, + int(orig_w)] + + hs = np.min(np.where(mask == 0)[0]) + he = np.max(np.where(mask == 0)[0]) + ws = np.min(np.where(mask == 0)[1]) + we = np.max(np.where(mask == 0)[1]) + crop_box = [ws, hs, we, he] + + mask_h = round(int(orig_h / 2) / 4) * 4 + mask_w = round(int(orig_w / 2) / 4) * 4 + + if (hs < mask_h) and (he < mask_h) and (ws < mask_w) and (we < mask_w): + crop_mask = mask[:mask_h, :mask_w, :] + res_pix = (0, mask_h, 0, mask_w) + elif (hs < mask_h) and (he < mask_h) and (ws > mask_w) and (we > mask_w): + crop_mask = mask[:mask_h, orig_w - mask_w:orig_w, :] + res_pix = (0, mask_h, orig_w - mask_w, int(orig_w)) + elif (hs > 
mask_h) and (he > mask_h) and (ws < mask_w) and (we < mask_w): + crop_mask = mask[orig_h - mask_h:orig_h, :mask_w, :] + res_pix = (orig_h - mask_h, int(orig_h), 0, mask_w) + elif (hs > mask_h) and (he > mask_h) and (ws > mask_w) and (we > mask_w): + crop_mask = mask[orig_h - mask_h:orig_h, orig_w - mask_w:orig_w, :] + res_pix = (orig_h - mask_h, int(orig_h), orig_w - mask_w, int(orig_w)) + + elif (hs < mask_h) and (he < mask_h) and (ws < mask_w) and (we > mask_w): + crop_mask = mask[:mask_h, :, :] + res_pix = (0, mask_h, 0, int(orig_w)) + elif (hs < mask_h) and (he > mask_h) and (ws < mask_w) and (we < mask_w): + crop_mask = mask[:, :mask_w, :] + res_pix = (0, int(orig_h), 0, mask_w) + elif (hs > mask_h) and (he > mask_h) and (ws < mask_w) and (we > mask_w): + crop_mask = mask[orig_h - mask_h:orig_h, :, :] + res_pix = (orig_h - mask_h, int(orig_h), 0, int(orig_w)) + elif (hs < mask_h) and (he > mask_h) and (ws > mask_w) and (we > mask_w): + crop_mask = mask[:, orig_w - mask_w:orig_w, :] + res_pix = (0, int(orig_h), orig_w - mask_w, int(orig_w)) + else: + crop_mask = mask + res_pix = (0, int(orig_h), 0, int(orig_w)) + a = ws - res_pix[2] + b = hs - res_pix[0] + c = we - res_pix[2] + d = he - res_pix[0] + return crop_mask, res_pix, crop_box, [a, b, c, d] + + +def get_ref_index(neighbor_ids, length): + ref_index = [] + for i in range(0, length, ref_length): + if i not in neighbor_ids: + ref_index.append(i) + return ref_index + + +def read_mask_oneImage(mpath): + masks = [] + print('mask_path: {}'.format(mpath)) + start = int(mpath.split('/')[-1].split('mask_')[1].split('_')[0]) + end = int( + mpath.split('/')[-1].split('mask_')[1].split('_')[1].split('.')[0]) + m = Image.open(mpath) + m = np.array(m.convert('L')) + m = np.array(m > 0).astype(np.uint8) + m = 1 - m + for i in range(start - 1, end + 1): + masks.append(Image.fromarray(m * 255)) + return masks + + +def check_size(h, w): + is_resize = False + if h != 240: + h = 240 + is_resize = True + if w != 432: + w = 432 + is_resize = True + return is_resize + + +def get_mask_list(mask_path): + mask_names = os.listdir(mask_path) + mask_names.sort() + + abs_mask_path = [] + mask_list = [] + begin_list = [] + end_list = [] + + for mask_name in mask_names: + mask_name_tmp = mask_name.split('mask_')[1] + begin_list.append(int(mask_name_tmp.split('_')[0])) + end_list.append(int(mask_name_tmp.split('_')[1].split('.')[0])) + abs_mask_path.append(os.path.join(mask_path, mask_name)) + mask = cv2.imread(os.path.join(mask_path, mask_name)) + mask_list.append(mask) + return mask_list, begin_list, end_list, abs_mask_path + + +def inpainting_by_model_balance(model, video_inputPath, mask_path, + video_savePath, fps, w_ori, h_ori): + + video_ori = cv2.VideoCapture(video_inputPath) + + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_save = cv2.VideoWriter(video_savePath, fourcc, fps, (w_ori, h_ori)) + + mask_list, begin_list, end_list, abs_mask_path = get_mask_list(mask_path) + + img_npy = [] + + for index, mask in enumerate(mask_list): + + masks = read_mask_oneImage(abs_mask_path[index]) + + mask, res_pix, crop_for_oriimg, crop_for_inpimg = get_crop_mask_v1( + mask) + mask_h, mask_w = mask.shape[0:2] + is_resize = check_size(mask.shape[0], mask.shape[1]) + + begin = begin_list[index] + end = end_list[index] + print('begin: {}'.format(begin)) + print('end: {}'.format(end)) + + for i in range(begin, end + 1, MAX_frame): + begin_time = time.time() + if i + MAX_frame <= end: + video_length = MAX_frame + else: + video_length = end - i + 1 + + for frame_count in 
range(video_length): + _, frame = video_ori.read() + img_npy.append(frame) + frames_temp = [] + for f in img_npy: + f = Image.fromarray(f) + i_temp = f.crop( + (res_pix[2], res_pix[0], res_pix[3], res_pix[1])) + a = i_temp.resize((w, h), Image.NEAREST) + frames_temp.append(a) + feats_temp = _to_tensors(frames_temp).unsqueeze(0) * 2 - 1 + frames_temp = [np.array(f).astype(np.uint8) for f in frames_temp] + masks_temp = [] + for m in masks[i - begin:i + video_length - begin]: + + m_temp = m.crop( + (res_pix[2], res_pix[0], res_pix[3], res_pix[1])) + b = m_temp.resize((w, h), Image.NEAREST) + masks_temp.append(b) + binary_masks_temp = [ + np.expand_dims((np.array(m) != 0).astype(np.uint8), 2) + for m in masks_temp + ] + masks_temp = _to_tensors(masks_temp).unsqueeze(0) + if torch.cuda.is_available(): + feats_temp, masks_temp = feats_temp.cuda(), masks_temp.cuda() + comp_frames = [None] * video_length + model.eval() + with torch.no_grad(): + feats_out = feats_temp * (1 - masks_temp).float() + feats_out = feats_out.view(video_length, 3, h, w) + feats_out = model.model.encoder(feats_out) + _, c, feat_h, feat_w = feats_out.size() + feats_out = feats_out.view(1, video_length, c, feat_h, feat_w) + + for f in range(0, video_length, neighbor_stride): + neighbor_ids = [ + i for i in range( + max(0, f - neighbor_stride), + min(video_length, f + neighbor_stride + 1)) + ] + ref_ids = get_ref_index(neighbor_ids, video_length) + with torch.no_grad(): + pred_feat = model.model.infer( + feats_out[0, neighbor_ids + ref_ids, :, :, :], + masks_temp[0, neighbor_ids + ref_ids, :, :, :]) + pred_img = torch.tanh( + model.model.decoder( + pred_feat[:len(neighbor_ids), :, :, :])).detach() + pred_img = (pred_img + 1) / 2 + pred_img = pred_img.cpu().permute(0, 2, 3, 1).numpy() * 255 + for j in range(len(neighbor_ids)): + idx = neighbor_ids[j] + img = np.array(pred_img[j]).astype( + np.uint8) * binary_masks_temp[idx] + frames_temp[ + idx] * (1 - binary_masks_temp[idx]) + if comp_frames[idx] is None: + comp_frames[idx] = img + else: + comp_frames[idx] = comp_frames[idx].astype( + np.float32) * 0.5 + img.astype( + np.float32) * 0.5 + print('inpainting time:', time.time() - begin_time) + for f in range(video_length): + comp = np.array(comp_frames[f]).astype( + np.uint8) * binary_masks_temp[f] + frames_temp[f] * ( + 1 - binary_masks_temp[f]) + if is_resize: + comp = cv2.resize(comp, (mask_w, mask_h)) + complete_frame = img_npy[f] + a1, b1, c1, d1 = crop_for_oriimg + a2, b2, c2, d2 = crop_for_inpimg + complete_frame[b1:d1, a1:c1] = comp[b2:d2, a2:c2] + video_save.write(complete_frame) + + img_npy = [] + + video_ori.release() diff --git a/modelscope/models/cv/video_inpainting/inpainting_model.py b/modelscope/models/cv/video_inpainting/inpainting_model.py new file mode 100644 index 00000000..ffecde67 --- /dev/null +++ b/modelscope/models/cv/video_inpainting/inpainting_model.py @@ -0,0 +1,381 @@ +""" VideoInpaintingProcess +The implementation here is modified based on STTN, + originally Apache 2.0 License and publicly avaialbe at https://github.com/researchmm/STTN +""" + +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class 
BaseNetwork(nn.Module): + + def __init__(self): + super(BaseNetwork, self).__init__() + + def print_network(self): + if isinstance(self, list): + self = self[0] + num_params = 0 + for param in self.parameters(): + num_params += param.numel() + print( + 'Network [%s] was created. Total number of parameters: %.1f million. ' + 'To see the architecture, do print(network).' % + (type(self).__name__, num_params / 1000000)) + + def init_weights(self, init_type='normal', gain=0.02): + ''' + initialize network's weights + init_type: normal | xavier | kaiming | orthogonal + https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39 + ''' + + def init_func(m): + classname = m.__class__.__name__ + if classname.find('InstanceNorm2d') != -1: + if hasattr(m, 'weight') and m.weight is not None: + nn.init.constant_(m.weight.data, 1.0) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias.data, 0.0) + elif hasattr(m, 'weight') and (classname.find('Conv') != -1 + or classname.find('Linear') != -1): + if init_type == 'normal': + nn.init.normal_(m.weight.data, 0.0, gain) + elif init_type == 'xavier': + nn.init.xavier_normal_(m.weight.data, gain=gain) + elif init_type == 'xavier_uniform': + nn.init.xavier_uniform_(m.weight.data, gain=1.0) + elif init_type == 'kaiming': + nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') + elif init_type == 'orthogonal': + nn.init.orthogonal_(m.weight.data, gain=gain) + elif init_type == 'none': + m.reset_parameters() + else: + raise NotImplementedError( + 'initialization method [%s] is not implemented' + % init_type) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias.data, 0.0) + + self.apply(init_func) + + for m in self.children(): + if hasattr(m, 'init_weights'): + m.init_weights(init_type, gain) + + +@MODELS.register_module( + Tasks.video_inpainting, module_name=Models.video_inpainting) +class VideoInpainting(TorchModel): + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + self.model = InpaintGenerator() + if torch.cuda.is_available(): + device = 'cuda' + else: + device = 'cpu' + pretrained_params = torch.load( + '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), + map_location=device) + self.model.load_state_dict(pretrained_params['netG']) + self.model.eval() + self.device_id = device_id + if self.device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device_id = -1 + logger.info('Use CPU for inference') + + +class InpaintGenerator(BaseNetwork): + + def __init__(self, init_weights=True): + super(InpaintGenerator, self).__init__() + channel = 256 + stack_num = 6 + patchsize = [(48, 24), (16, 8), (8, 4), (4, 2)] + blocks = [] + for _ in range(stack_num): + blocks.append(TransformerBlock(patchsize, hidden=channel)) + self.transformer = nn.Sequential(*blocks) + + self.encoder = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(128, channel, kernel_size=3, stride=1, padding=1), + nn.LeakyReLU(0.2, inplace=True), + ) + + self.decoder = nn.Sequential( + deconv(channel, 128, kernel_size=3, padding=1), + 
nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1), + nn.LeakyReLU(0.2, inplace=True), + deconv(64, 64, kernel_size=3, padding=1), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1)) + + if init_weights: + self.init_weights() + + def forward(self, masked_frames, masks): + b, t, c, h, w = masked_frames.size() + masks = masks.view(b * t, 1, h, w) + enc_feat = self.encoder(masked_frames.view(b * t, c, h, w)) + _, c, h, w = enc_feat.size() + masks = F.interpolate(masks, scale_factor=1.0 / 4) + enc_feat = self.transformer({ + 'x': enc_feat, + 'm': masks, + 'b': b, + 'c': c + })['x'] + output = self.decoder(enc_feat) + output = torch.tanh(output) + return output + + def infer(self, feat, masks): + t, c, h, w = masks.size() + masks = masks.view(t, c, h, w) + masks = F.interpolate(masks, scale_factor=1.0 / 4) + t, c, _, _ = feat.size() + enc_feat = self.transformer({ + 'x': feat, + 'm': masks, + 'b': 1, + 'c': c + })['x'] + return enc_feat + + +class deconv(nn.Module): + + def __init__(self, + input_channel, + output_channel, + kernel_size=3, + padding=0): + super().__init__() + self.conv = nn.Conv2d( + input_channel, + output_channel, + kernel_size=kernel_size, + stride=1, + padding=padding) + + def forward(self, x): + x = F.interpolate( + x, scale_factor=2, mode='bilinear', align_corners=True) + x = self.conv(x) + return x + + +class Attention(nn.Module): + """ + Compute 'Scaled Dot Product Attention + """ + + def forward(self, query, key, value, m): + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt( + query.size(-1)) + scores.masked_fill(m, -1e9) + p_attn = F.softmax(scores, dim=-1) + p_val = torch.matmul(p_attn, value) + return p_val, p_attn + + +class MultiHeadedAttention(nn.Module): + """ + Take in model size and number of heads. 
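For reference, the scaled dot-product attention used above can be written as a small standalone function; shapes and the -1e9 masking constant mirror the code above, and the mask handling is shown in the standard out-of-place form.

import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(query, key, value, mask=None):
    # query/key/value: [B, L, D]; mask: broadcastable, nonzero where attention is blocked
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
    if mask is not None:
        # masked_fill returns a new tensor, so its result must be kept
        scores = scores.masked_fill(mask.bool(), -1e9)
    attn = F.softmax(scores, dim=-1)
    return torch.matmul(attn, value), attn

q = k = v = torch.randn(2, 5, 8)
out, attn = scaled_dot_product_attention(q, k, v)
print(out.shape, attn.shape)    # torch.Size([2, 5, 8]) torch.Size([2, 5, 5])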
+ """ + + def __init__(self, patchsize, d_model): + super().__init__() + self.patchsize = patchsize + self.query_embedding = nn.Conv2d( + d_model, d_model, kernel_size=1, padding=0) + self.value_embedding = nn.Conv2d( + d_model, d_model, kernel_size=1, padding=0) + self.key_embedding = nn.Conv2d( + d_model, d_model, kernel_size=1, padding=0) + self.output_linear = nn.Sequential( + nn.Conv2d(d_model, d_model, kernel_size=3, padding=1), + nn.LeakyReLU(0.2, inplace=True)) + self.attention = Attention() + + def forward(self, x, m, b, c): + bt, _, h, w = x.size() + t = bt // b + d_k = c // len(self.patchsize) + output = [] + _query = self.query_embedding(x) + _key = self.key_embedding(x) + _value = self.value_embedding(x) + for (width, height), query, key, value in zip( + self.patchsize, + torch.chunk(_query, len(self.patchsize), dim=1), + torch.chunk(_key, len(self.patchsize), dim=1), + torch.chunk(_value, len(self.patchsize), dim=1)): + out_w, out_h = w // width, h // height + mm = m.view(b, t, 1, out_h, height, out_w, width) + mm = mm.permute(0, 1, 3, 5, 2, 4, + 6).contiguous().view(b, t * out_h * out_w, + height * width) + mm = (mm.mean(-1) > 0.5).unsqueeze(1).repeat( + 1, t * out_h * out_w, 1) + query = query.view(b, t, d_k, out_h, height, out_w, width) + query = query.permute(0, 1, 3, 5, 2, 4, + 6).contiguous().view(b, t * out_h * out_w, + d_k * height * width) + key = key.view(b, t, d_k, out_h, height, out_w, width) + key = key.permute(0, 1, 3, 5, 2, 4, + 6).contiguous().view(b, t * out_h * out_w, + d_k * height * width) + value = value.view(b, t, d_k, out_h, height, out_w, width) + value = value.permute(0, 1, 3, 5, 2, 4, + 6).contiguous().view(b, t * out_h * out_w, + d_k * height * width) + y, _ = self.attention(query, key, value, mm) + y = y.view(b, t, out_h, out_w, d_k, height, width) + y = y.permute(0, 1, 4, 2, 5, 3, 6).contiguous().view(bt, d_k, h, w) + output.append(y) + output = torch.cat(output, 1) + x = self.output_linear(output) + return x + + +class FeedForward(nn.Module): + + def __init__(self, d_model): + super(FeedForward, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d(d_model, d_model, kernel_size=3, padding=2, dilation=2), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv2d(d_model, d_model, kernel_size=3, padding=1), + nn.LeakyReLU(0.2, inplace=True)) + + def forward(self, x): + x = self.conv(x) + return x + + +class TransformerBlock(nn.Module): + """ + Transformer = MultiHead_Attention + Feed_Forward with sublayer connection + """ + + def __init__(self, patchsize, hidden=128): # hidden=128 + super().__init__() + self.attention = MultiHeadedAttention(patchsize, d_model=hidden) + self.feed_forward = FeedForward(hidden) + + def forward(self, x): + x, m, b, c = x['x'], x['m'], x['b'], x['c'] + x = x + self.attention(x, m, b, c) + x = x + self.feed_forward(x) + return {'x': x, 'm': m, 'b': b, 'c': c} + + +class Discriminator(BaseNetwork): + + def __init__(self, + in_channels=3, + use_sigmoid=False, + use_spectral_norm=True, + init_weights=True): + super(Discriminator, self).__init__() + self.use_sigmoid = use_sigmoid + nf = 64 + + self.conv = nn.Sequential( + spectral_norm( + nn.Conv3d( + in_channels=in_channels, + out_channels=nf * 1, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=1, + bias=not use_spectral_norm), use_spectral_norm), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d( + nf * 1, + nf * 2, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + nn.LeakyReLU(0.2, 
inplace=True), + spectral_norm( + nn.Conv3d( + nf * 2, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d( + nf * 4, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + nn.LeakyReLU(0.2, inplace=True), + spectral_norm( + nn.Conv3d( + nf * 4, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2), + bias=not use_spectral_norm), use_spectral_norm), + nn.LeakyReLU(0.2, inplace=True), + nn.Conv3d( + nf * 4, + nf * 4, + kernel_size=(3, 5, 5), + stride=(1, 2, 2), + padding=(1, 2, 2))) + + if init_weights: + self.init_weights() + + def forward(self, xs): + xs_t = torch.transpose(xs, 0, 1) + xs_t = xs_t.unsqueeze(0) + feat = self.conv(xs_t) + if self.use_sigmoid: + feat = torch.sigmoid(feat) + out = torch.transpose(feat, 1, 2) + return out + + +def spectral_norm(module, mode=True): + if mode: + return _spectral_norm(module) + return module diff --git a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py index 8be07928..6805c503 100644 --- a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py +++ b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ from easydict import EasyDict as edict cfg = edict() diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py index 00eb7e1c..e245c821 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import torch.nn as nn diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py index 3505d5e1..702c84f1 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import math import torch diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py index 77706dbc..e0dc7b59 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ 
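The trickiest part of the inpainting transformer above is the reshape that turns per-frame feature maps into spatio-temporal patch tokens. A shape check using the module's defaults (256 channels split over 4 patch scales; 96x192 frames downsampled by 4 by the encoder):

import torch

b, t = 1, 4                     # batch of 1 video clip, 4 frames
d_k, h, w = 64, 24, 48          # 256 // 4 channels per scale; encoder output for 96x192 inputs
width, height = 16, 8           # one entry of the patchsize list above
out_h, out_w = h // height, w // width          # 3 x 3 patches per frame

x = torch.randn(b * t, d_k, h, w)               # frames stacked along the batch axis
tokens = x.view(b, t, d_k, out_h, height, out_w, width)
tokens = tokens.permute(0, 1, 3, 5, 2, 4, 6).contiguous()
tokens = tokens.view(b, t * out_h * out_w, d_k * height * width)
print(tokens.shape)             # torch.Size([1, 36, 8192]): 36 patch tokens across all frames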
import torch import torch.nn as nn diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py index b1099fdf..c001663f 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import torch.nn as nn from timm.models.layers import to_2tuple diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py index de3a7b83..20d73422 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import torch.nn as nn from timm.models.layers import to_2tuple diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py index 40ed54f1..52704a6c 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import torch from torch import nn diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py index e1130069..46e7c18a 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import torch diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py index 9f010332..f186cf89 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ from functools import partial import torch diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py index 02f4c79e..5093a72d 100644 --- 
a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py +++ b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import torch from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py index 51911957..752ec272 100644 --- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py +++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py @@ -1,4 +1,5 @@ -# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ +# The implementation is adopted from OSTrack, +# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/ import math from typing import Optional diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py index 9219a281..0053da43 100644 --- a/modelscope/models/multi_modal/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -14,6 +14,8 @@ if TYPE_CHECKING: from .ofa_for_all_tasks import OfaForAllTasks from .ofa_for_text_to_image_synthesis_model import \ OfaForTextToImageSynthesis + from .multi_stage_diffusion import \ + MultiStageDiffusionForTextToImageSynthesis else: _import_structure = { @@ -25,7 +27,9 @@ else: 'mplug_for_all_tasks': ['MPlugForAllTasks'], 'ofa_for_all_tasks': ['OfaForAllTasks'], 'ofa_for_text_to_image_synthesis_model': - ['OfaForTextToImageSynthesis'] + ['OfaForTextToImageSynthesis'], + 'multi_stage_diffusion': + ['MultiStageDiffusionForTextToImageSynthesis'] } import sys diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 4e959a17..8d13e745 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -42,7 +42,10 @@ class VideoCLIPForMultiModalEmbedding(TorchModel): self.max_frames = model_config['max_frames'] self.feature_framerate = model_config['feature_framerate'] self.image_resolution = 224 - self.device = model_config['device'] + if torch.cuda.is_available(): + self.device = model_config['device'] + else: + self.device = 'cpu' self.init_model = f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}' self.tokenizer = ClipTokenizer(model_dir) diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py index 214e65c7..21cc4c80 100644 --- a/modelscope/models/multi_modal/mmr/models/modeling.py +++ b/modelscope/models/multi_modal/mmr/models/modeling.py @@ -85,9 +85,6 @@ class CLIP4Clip(nn.Module): linear_patch=config['linear_patch'], use_gc=config['use_gc']).float() - if (platform.system() != 'Darwin'): - convert_weights(self.clip) # fp16 - if backbone in ['ViT-B/32', 'ViT-B/16']: cross_config = SimpleNamespace(**{ 'hidden_size': 512, diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index f469c218..ec491f1d 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -1868,6 +1868,8 @@ class 
MPlug(PreTrainedModel): checkpoint = torch.load(checkpoint_path, map_location='cpu') if 'model' in checkpoint: checkpoint = checkpoint['model'] + if 'module' in checkpoint: + checkpoint = checkpoint['module'] checkpoint = { k.replace('model.', ''): v for k, v in checkpoint.items() diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index 608cc733..d61fea10 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -1,10 +1,13 @@ +import os.path as osp from typing import Dict, List from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks +from modelscope.outputs import OutputKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks __all__ = ['MPlugForAllTasks'] @@ -44,17 +47,24 @@ class MPlugForAllTasks(TorchModel): ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) + # get task from config file + task = Config.from_file( + osp.join(self.model_dir, ModelFile.CONFIGURATION)).task + # inference if not self.training and 'question' in input: output = self.model(input['image'], input['question'], train=False) - if not isinstance(output, tuple): - return output + if task == Tasks.image_text_retrieval: + return {OutputKeys.SCORES: output[0].tolist()} topk_ids, _ = output - pred_string: str = self.tokenizer.decode(topk_ids[0][0]) + pred_string: List[str] = \ + self.tokenizer.decode(topk_ids[0][0]) for _old, _new in replace_tokens_bert: pred_string = pred_string.replace(_old, _new) pred_string = pred_string.strip() - return pred_string + output_key = OutputKeys.CAPTION \ + if task == Tasks.image_captioning else OutputKeys.TEXT + return {output_key: pred_string} # train and evaluate import addict @@ -71,7 +81,7 @@ class MPlugForAllTasks(TorchModel): index = input['index'] output = self.model(image, answer, index, train=self.training) if self.training: - return {'loss': output} + return {OutputKeys.LOSS: output} # evaluate topk_ids, _ = output diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py new file mode 100644 index 00000000..accbb56e --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py @@ -0,0 +1 @@ +from .model import MultiStageDiffusionForTextToImageSynthesis diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py new file mode 100644 index 00000000..54e971f7 --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py @@ -0,0 +1,318 @@ +# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['CLIP'] + + +def to_fp16(m): + if isinstance(m, (nn.Linear, nn.Conv2d)): + m.weight.data = m.weight.data.half() + if m.bias is not None: + m.bias.data = m.bias.data.half() + elif hasattr(m, 'head'): + p = getattr(m, 'head') + p.data = p.data.half() + + +class QuickGELU(nn.Module): + + def forward(self, x): + return x * torch.sigmoid(1.702 * x) + + +class LayerNorm(nn.LayerNorm): + r"""Subclass of nn.LayerNorm to handle fp16. 
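QuickGELU above is the sigmoid-based approximation of GELU used by the original CLIP; a quick numerical comparison against the exact activation:

import torch
import torch.nn.functional as F

x = torch.linspace(-4, 4, steps=801)
quick = x * torch.sigmoid(1.702 * x)                  # QuickGELU, as defined above
exact = F.gelu(x)
print(torch.max(torch.abs(quick - exact)).item())     # roughly 0.02, so the approximation is close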
+ """ + + def forward(self, x): + return super(LayerNorm, self).forward(x.float()).type_as(x) + + +class SelfAttention(nn.Module): + + def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0): + assert dim % num_heads == 0 + super(SelfAttention, self).__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = 1.0 / math.sqrt(self.head_dim) + + # layers + self.to_qkv = nn.Linear(dim, dim * 3) + self.attn_dropout = nn.Dropout(attn_dropout) + self.proj = nn.Linear(dim, dim) + self.proj_dropout = nn.Dropout(proj_dropout) + + def forward(self, x, mask=None): + r"""x: [B, L, C]. + mask: [*, L, L]. + """ + b, l, _, n = *x.size(), self.num_heads + + # compute query, key, and value + q, k, v = self.to_qkv(x.transpose(0, 1)).chunk(3, dim=-1) + q = q.reshape(l, b * n, -1).transpose(0, 1) + k = k.reshape(l, b * n, -1).transpose(0, 1) + v = v.reshape(l, b * n, -1).transpose(0, 1) + + # compute attention + attn = self.scale * torch.bmm(q, k.transpose(1, 2)) + if mask is not None: + attn = attn.masked_fill(mask[:, :l, :l] == 0, float('-inf')) + attn = F.softmax(attn.float(), dim=-1).type_as(attn) + attn = self.attn_dropout(attn) + + # gather context + x = torch.bmm(attn, v) + x = x.view(b, n, l, -1).transpose(1, 2).reshape(b, l, -1) + + # output + x = self.proj(x) + x = self.proj_dropout(x) + return x + + +class AttentionBlock(nn.Module): + + def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0): + super(AttentionBlock, self).__init__() + self.dim = dim + self.num_heads = num_heads + + # layers + self.norm1 = LayerNorm(dim) + self.attn = SelfAttention(dim, num_heads, attn_dropout, proj_dropout) + self.norm2 = LayerNorm(dim) + self.mlp = nn.Sequential( + nn.Linear(dim, dim * 4), QuickGELU(), nn.Linear(dim * 4, dim), + nn.Dropout(proj_dropout)) + + def forward(self, x, mask=None): + x = x + self.attn(self.norm1(x), mask) + x = x + self.mlp(self.norm2(x)) + return x + + +class VisionTransformer(nn.Module): + + def __init__(self, + image_size=224, + patch_size=16, + dim=768, + out_dim=512, + num_heads=12, + num_layers=12, + attn_dropout=0.0, + proj_dropout=0.0, + embedding_dropout=0.0): + assert image_size % patch_size == 0 + super(VisionTransformer, self).__init__() + self.image_size = image_size + self.patch_size = patch_size + self.dim = dim + self.out_dim = out_dim + self.num_heads = num_heads + self.num_layers = num_layers + self.num_patches = (image_size // patch_size)**2 + + # embeddings + gain = 1.0 / math.sqrt(dim) + self.patch_embedding = nn.Conv2d( + 3, dim, kernel_size=patch_size, stride=patch_size, bias=False) + self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim)) + self.pos_embedding = nn.Parameter( + gain * torch.randn(1, self.num_patches + 1, dim)) + self.dropout = nn.Dropout(embedding_dropout) + + # transformer + self.pre_norm = LayerNorm(dim) + self.transformer = nn.Sequential(*[ + AttentionBlock(dim, num_heads, attn_dropout, proj_dropout) + for _ in range(num_layers) + ]) + self.post_norm = LayerNorm(dim) + + # head + self.head = nn.Parameter(gain * torch.randn(dim, out_dim)) + + def forward(self, x): + b, dtype = x.size(0), self.head.dtype + x = x.type(dtype) + + # patch-embedding + x = self.patch_embedding(x).flatten(2).permute(0, 2, 1) # [b, n, c] + x = torch.cat([self.cls_embedding.repeat(b, 1, 1).type(dtype), x], + dim=1) + x = self.dropout(x + self.pos_embedding.type(dtype)) + x = self.pre_norm(x) + + # transformer + x = self.transformer(x) + + # head + x = self.post_norm(x) + x = torch.mm(x[:, 
0, :], self.head) + return x + + def fp16(self): + return self.apply(to_fp16) + + +class TextTransformer(nn.Module): + + def __init__(self, + vocab_size, + text_len, + dim=512, + out_dim=512, + num_heads=8, + num_layers=12, + attn_dropout=0.0, + proj_dropout=0.0, + embedding_dropout=0.0): + super(TextTransformer, self).__init__() + self.vocab_size = vocab_size + self.text_len = text_len + self.dim = dim + self.out_dim = out_dim + self.num_heads = num_heads + self.num_layers = num_layers + + # embeddings + self.token_embedding = nn.Embedding(vocab_size, dim) + self.pos_embedding = nn.Parameter(0.01 * torch.randn(1, text_len, dim)) + self.dropout = nn.Dropout(embedding_dropout) + + # transformer + self.transformer = nn.ModuleList([ + AttentionBlock(dim, num_heads, attn_dropout, proj_dropout) + for _ in range(num_layers) + ]) + self.norm = LayerNorm(dim) + + # head + gain = 1.0 / math.sqrt(dim) + self.head = nn.Parameter(gain * torch.randn(dim, out_dim)) + + # causal attention mask + self.register_buffer('attn_mask', + torch.tril(torch.ones(1, text_len, text_len))) + + def forward(self, x): + eot, dtype = x.argmax(dim=-1), self.head.dtype + + # embeddings + x = self.dropout( + self.token_embedding(x).type(dtype) + + self.pos_embedding.type(dtype)) + + # transformer + for block in self.transformer: + x = block(x, self.attn_mask) + + # head + x = self.norm(x) + x = torch.mm(x[torch.arange(x.size(0)), eot], self.head) + return x + + def fp16(self): + return self.apply(to_fp16) + + +class CLIP(nn.Module): + + def __init__(self, + embed_dim=512, + image_size=224, + patch_size=16, + vision_dim=768, + vision_heads=12, + vision_layers=12, + vocab_size=49408, + text_len=77, + text_dim=512, + text_heads=8, + text_layers=12, + attn_dropout=0.0, + proj_dropout=0.0, + embedding_dropout=0.0): + super(CLIP, self).__init__() + self.embed_dim = embed_dim + self.image_size = image_size + self.patch_size = patch_size + self.vision_dim = vision_dim + self.vision_heads = vision_heads + self.vision_layers = vision_layers + self.vocab_size = vocab_size + self.text_len = text_len + self.text_dim = text_dim + self.text_heads = text_heads + self.text_layers = text_layers + + # models + self.visual = VisionTransformer( + image_size=image_size, + patch_size=patch_size, + dim=vision_dim, + out_dim=embed_dim, + num_heads=vision_heads, + num_layers=vision_layers, + attn_dropout=attn_dropout, + proj_dropout=proj_dropout, + embedding_dropout=embedding_dropout) + self.textual = TextTransformer( + vocab_size=vocab_size, + text_len=text_len, + dim=text_dim, + out_dim=embed_dim, + num_heads=text_heads, + num_layers=text_layers, + attn_dropout=attn_dropout, + proj_dropout=proj_dropout, + embedding_dropout=embedding_dropout) + self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([])) + + def forward(self, imgs, txt_tokens): + r"""imgs: [B, C, H, W] of torch.float32. + txt_tokens: [B, T] of torch.long. 
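Two details of the text transformer above are easy to check in isolation: the lower-triangular causal mask, and pooling at the end-of-text token via argmax, which works because the end-of-text id is the largest id in the vocabulary. The middle token ids below are illustrative.

import torch

text_len = 6
attn_mask = torch.tril(torch.ones(1, text_len, text_len))
print(attn_mask[0, :3, :3])     # each position may only attend to itself and earlier tokens

# 49406 = start-of-text, 49407 = end-of-text, 0 = padding (vocab_size is 49408 above)
tokens = torch.tensor([[49406, 320, 1125, 49407, 0, 0]])
eot_index = tokens.argmax(dim=-1)
print(eot_index)                # tensor([3]) -> the feature at this position is pooled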
+ """ + xi = self.visual(imgs) + xt = self.textual(txt_tokens) + + # normalize features + xi = F.normalize(xi, p=2, dim=1) + xt = F.normalize(xt, p=2, dim=1) + + # logits + scale = self.log_scale.exp() + logits_i2t = scale * torch.mm(xi, xt.t()) + logits_t2i = scale * torch.mm(xt, xi.t()) + return logits_i2t, logits_t2i + + def init_weights(self): + # embeddings + nn.init.normal_(self.textual.token_embedding.weight, std=0.02) + nn.init.normal_(self.visual.patch_embedding.weight, tsd=0.1) + + # attentions + for modality in ['visual', 'textual']: + dim = self.vision_dim if modality == 'visual' else 'textual' + transformer = getattr(self, modality).transformer + proj_gain = (1.0 / math.sqrt(dim)) * ( + 1.0 / math.sqrt(2 * transformer.num_layers)) + attn_gain = 1.0 / math.sqrt(dim) + mlp_gain = 1.0 / math.sqrt(2.0 * dim) + for block in transformer.layers: + nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain) + nn.init.normal_(block.attn.proj.weight, std=proj_gain) + nn.init.normal_(block.mlp[0].weight, std=mlp_gain) + nn.init.normal_(block.mlp[2].weight, std=proj_gain) + + def fp16(self): + return self.apply(to_fp16) diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py new file mode 100644 index 00000000..17daedaf --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py @@ -0,0 +1,322 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['Decoder'] + + +def sinusoidal_embedding(timesteps, dim): + # check input + half = dim // 2 + timesteps = timesteps.float() + + # compute sinusoidal embedding + sinusoid = torch.outer( + timesteps, torch.pow(10000, + -torch.arange(half).to(timesteps).div(half))) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) + if dim % 2 != 0: + x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1) + return x + + +class Resample(nn.Module): + + def __init__(self, in_dim, out_dim, scale_factor, use_conv=False): + assert scale_factor in [0.5, 1.0, 2.0] + super(Resample, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.scale_factor = scale_factor + self.use_conv = use_conv + + # layers + if scale_factor == 2.0: + self.resample = nn.Sequential( + nn.Upsample(scale_factor=scale_factor, mode='nearest'), + nn.Conv2d(in_dim, out_dim, 3, padding=1) + if use_conv else nn.Identity()) + elif scale_factor == 0.5: + self.resample = nn.Conv2d( + in_dim, out_dim, 3, stride=2, + padding=1) if use_conv else nn.AvgPool2d( + kernel_size=2, stride=2) + else: + self.resample = nn.Identity() + + def forward(self, x): + return self.resample(x) + + +class ResidualBlock(nn.Module): + + def __init__(self, + in_dim, + embed_dim, + out_dim, + use_scale_shift_norm=True, + scale_factor=1.0, + dropout=0.0): + super(ResidualBlock, self).__init__() + self.in_dim = in_dim + self.embed_dim = embed_dim + self.out_dim = out_dim + self.use_scale_shift_norm = use_scale_shift_norm + self.scale_factor = scale_factor + + # layers + self.layer1 = nn.Sequential( + nn.GroupNorm(32, in_dim), nn.SiLU(), + nn.Conv2d(in_dim, out_dim, 3, padding=1)) + self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False) + self.embedding = nn.Sequential( + nn.SiLU(), + nn.Linear(embed_dim, + out_dim * 2 if use_scale_shift_norm else out_dim)) + self.layer2 = nn.Sequential( + nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout), + nn.Conv2d(out_dim, out_dim, 3, 
padding=1)) + self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d( + in_dim, out_dim, 1) + + # zero out the last layer params + nn.init.zeros_(self.layer2[-1].weight) + + def forward(self, x, e): + identity = self.resample(x) + x = self.layer1[-1](self.resample(self.layer1[:-1](x))) + e = self.embedding(e).unsqueeze(-1).unsqueeze(-1).type(x.dtype) + if self.use_scale_shift_norm: + scale, shift = e.chunk(2, dim=1) + x = self.layer2[0](x) * (1 + scale) + shift + x = self.layer2[1:](x) + else: + x = x + e + x = self.layer2(x) + x = x + self.shortcut(identity) + return x + + +class AttentionBlock(nn.Module): + + def __init__(self, dim, context_dim=None, num_heads=None, head_dim=None): + # consider head_dim first, then num_heads + num_heads = dim // head_dim if head_dim else num_heads + head_dim = dim // num_heads + assert num_heads * head_dim == dim + super(AttentionBlock, self).__init__() + self.dim = dim + self.context_dim = context_dim + self.num_heads = num_heads + self.head_dim = head_dim + self.scale = math.pow(head_dim, -0.25) + + # layers + self.norm = nn.GroupNorm(32, dim) + self.to_qkv = nn.Conv2d(dim, dim * 3, 1) + if context_dim is not None: + self.context_kv = nn.Linear(context_dim, dim * 2) + self.proj = nn.Conv2d(dim, dim, 1) + + # zero out the last layer params + nn.init.zeros_(self.proj.weight) + + def forward(self, x, context=None): + r"""x: [B, C, H, W]. + context: [B, L, C] or None. + """ + identity = x + b, c, h, w, n, d = *x.size(), self.num_heads, self.head_dim + + # compute query, key, value + x = self.norm(x) + q, k, v = self.to_qkv(x).view(b, n * 3, d, h * w).chunk(3, dim=1) + if context is not None: + ck, cv = self.context_kv(context).reshape(b, -1, n * 2, + d).permute(0, 2, 3, + 1).chunk( + 2, dim=1) + k = torch.cat([ck, k], dim=-1) + v = torch.cat([cv, v], dim=-1) + + # compute attention + attn = torch.matmul(q.transpose(-1, -2) * self.scale, k * self.scale) + attn = F.softmax(attn, dim=-1) + + # gather context + x = torch.matmul(v, attn.transpose(-1, -2)) + x = x.reshape(b, c, h, w) + + # output + x = self.proj(x) + return x + identity + + +class Decoder(nn.Module): + + def __init__(self, + in_dim=3, + dim=512, + y_dim=512, + context_dim=512, + out_dim=6, + dim_mult=[1, 2, 3, 4], + num_heads=None, + head_dim=64, + num_res_blocks=3, + attn_scales=[1 / 2, 1 / 4, 1 / 8], + resblock_resample=True, + use_scale_shift_norm=True, + dropout=0.1): + embed_dim = dim * 4 + super(Decoder, self).__init__() + self.in_dim = in_dim + self.dim = dim + self.y_dim = y_dim + self.context_dim = context_dim + self.embed_dim = embed_dim + self.out_dim = out_dim + self.dim_mult = dim_mult + self.num_heads = num_heads + self.head_dim = head_dim + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.resblock_resample = resblock_resample + self.use_scale_shift_norm = use_scale_shift_norm + + # params + enc_dims = [dim * u for u in [1] + dim_mult] + dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]] + shortcut_dims = [] + scale = 1.0 + + # embeddings + self.time_embedding = nn.Sequential( + nn.Linear(dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, embed_dim)) + self.y_embedding = nn.Sequential( + nn.Linear(y_dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, embed_dim)) + self.context_embedding = nn.Sequential( + nn.Linear(y_dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, context_dim * 4)) + + # encoder + self.encoder = nn.ModuleList( + [nn.Conv2d(self.in_dim, dim, 3, padding=1)]) + shortcut_dims.append(dim) + for i, (in_dim, + out_dim) 
in enumerate(zip(enc_dims[:-1], enc_dims[1:])): + for j in range(num_res_blocks): + # residual (+attention) blocks + block = nn.ModuleList([ + ResidualBlock(in_dim, embed_dim, out_dim, + use_scale_shift_norm, 1.0, dropout) + ]) + if scale in attn_scales: + block.append( + AttentionBlock(out_dim, context_dim, num_heads, + head_dim)) + in_dim = out_dim + self.encoder.append(block) + shortcut_dims.append(out_dim) + + # downsample + if i != len(dim_mult) - 1 and j == num_res_blocks - 1: + if resblock_resample: + downsample = ResidualBlock(out_dim, embed_dim, out_dim, + use_scale_shift_norm, 0.5, + dropout) + else: + downsample = Resample( + out_dim, out_dim, 0.5, use_conv=True) + shortcut_dims.append(out_dim) + scale /= 2.0 + self.encoder.append(downsample) + + # middle + self.middle = nn.ModuleList([ + ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm, + 1.0, dropout), + AttentionBlock(out_dim, context_dim, num_heads, head_dim), + ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm, + 1.0, dropout) + ]) + + # decoder + self.decoder = nn.ModuleList() + for i, (in_dim, + out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])): + for j in range(num_res_blocks + 1): + # residual (+attention) blocks + block = nn.ModuleList([ + ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim, + out_dim, use_scale_shift_norm, 1.0, dropout) + ]) + if scale in attn_scales: + block.append( + AttentionBlock(out_dim, context_dim, num_heads, + head_dim)) + in_dim = out_dim + + # upsample + if i != len(dim_mult) - 1 and j == num_res_blocks: + if resblock_resample: + upsample = ResidualBlock(out_dim, embed_dim, out_dim, + use_scale_shift_norm, 2.0, + dropout) + else: + upsample = Resample( + out_dim, out_dim, 2.0, use_conv=True) + scale *= 2.0 + block.append(upsample) + self.decoder.append(block) + + # head + self.head = nn.Sequential( + nn.GroupNorm(32, out_dim), nn.SiLU(), + nn.Conv2d(out_dim, self.out_dim, 3, padding=1)) + + # zero out the last layer params + nn.init.zeros_(self.head[-1].weight) + + def forward(self, x, t, y): + # embeddings + e = self.time_embedding(sinusoidal_embedding( + t, self.dim)) + self.y_embedding(y) + context = self.context_embedding(y).view(-1, 4, self.context_dim) + + # encoder + xs = [] + for block in self.encoder: + x = self._forward_single(block, x, e, context) + xs.append(x) + + # middle + for block in self.middle: + x = self._forward_single(block, x, e, context) + + # decoder + for block in self.decoder: + x = torch.cat([x, xs.pop()], dim=1) + x = self._forward_single(block, x, e, context) + + # head + x = self.head(x) + return x + + def _forward_single(self, module, x, e, context): + if isinstance(module, ResidualBlock): + x = module(x, e) + elif isinstance(module, AttentionBlock): + x = module(x, context) + elif isinstance(module, nn.ModuleList): + for block in module: + x = self._forward_single(block, x, e, context) + else: + x = module(x) + return x diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py new file mode 100644 index 00000000..a4fc52e0 --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py @@ -0,0 +1,641 @@ +# The implementation here is modified based on latent diffusion, publicly available +# at https://github.com/CompVis/latent-diffusion. 
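For orientation, the Decoder added in decoder.py above is the 64x64, embedding-conditioned U-Net of this pipeline. A minimal shape-check sketch, assuming the default constructor arguments and random weights (the reading of the 6 output channels as an eps prediction plus variance weights is an assumption based on the learned-range variance parameterization used elsewhere in this PR):

    # sketch only: random weights, default arguments; heavy on CPU but runnable
    import torch

    from modelscope.models.multi_modal.multi_stage_diffusion.decoder import Decoder

    net = Decoder().eval()             # in_dim=3, dim=512, y_dim=512, out_dim=6
    x = torch.randn(1, 3, 64, 64)      # noisy image x_t
    t = torch.randint(0, 1000, (1, ))  # diffusion timestep
    y = torch.randn(1, 512)            # image embedding produced by the prior
    with torch.no_grad():
        out = net(x, t, y)             # -> [1, 6, 64, 64]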
+ +import math + +import torch + +__all__ = ['GaussianDiffusion', 'beta_schedule'] + + +def kl_divergence(mu1, logvar1, mu2, logvar2): + u1 = -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + u2 = ((mu1 - mu2)**2) * torch.exp(-logvar2) + return 0.5 * (u1 + u2) + + +def standard_normal_cdf(x): + r"""A fast approximation of the cumulative distribution function of the standard normal. + """ + return 0.5 * (1.0 + torch.tanh( + math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x0, mean, log_scale): + assert x0.shape == mean.shape == log_scale.shape + cx = x0 - mean + inv_stdv = torch.exp(-log_scale) + cdf_plus = standard_normal_cdf(inv_stdv * (cx + 1.0 / 255.0)) + cdf_min = standard_normal_cdf(inv_stdv * (cx - 1.0 / 255.0)) + log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = torch.where( + x0 < -0.999, log_cdf_plus, + torch.where(x0 > 0.999, log_one_minus_cdf_min, + torch.log(cdf_delta.clamp(min=1e-12)))) + assert log_probs.shape == x0.shape + return log_probs + + +def _i(tensor, t, x): + r"""Index tensor using t and format the output according to x. + """ + shape = (x.size(0), ) + (1, ) * (x.ndim - 1) + return tensor[t].view(shape).to(x) + + +def beta_schedule(schedule, + num_timesteps=1000, + init_beta=None, + last_beta=None): + if schedule == 'linear': + scale = 1000.0 / num_timesteps + init_beta = init_beta or scale * 0.0001 + last_beta = last_beta or scale * 0.02 + return torch.linspace( + init_beta, last_beta, num_timesteps, dtype=torch.float64) + elif schedule == 'quadratic': + init_beta = init_beta or 0.0015 + last_beta = last_beta or 0.0195 + return torch.linspace( + init_beta**0.5, last_beta**0.5, num_timesteps, + dtype=torch.float64)**2 + elif schedule == 'cosine': + betas = [] + for step in range(num_timesteps): + t1 = step / num_timesteps + t2 = (step + 1) / num_timesteps + fn_t1 = math.cos((t1 + 0.008) / 1.008 * math.pi / 2)**2 + fn_t2 = math.cos((t2 + 0.008) / 1.008 * math.pi / 2)**2 + betas.append(min(1.0 - fn_t2 / fn_t1, 0.999)) + return torch.tensor(betas, dtype=torch.float64) + else: + raise ValueError(f'Unsupported schedule: {schedule}') + + +class GaussianDiffusion(object): + + def __init__(self, + betas, + mean_type='eps', + var_type='learned_range', + loss_type='mse', + rescale_timesteps=False): + # check input + if not isinstance(betas, torch.DoubleTensor): + betas = torch.tensor(betas, dtype=torch.float64) + assert min(betas) > 0 and max(betas) <= 1 + assert mean_type in ['x0', 'x_{t-1}', 'eps'] + assert var_type in [ + 'learned', 'learned_range', 'fixed_large', 'fixed_small' + ] + assert loss_type in [ + 'mse', 'rescaled_mse', 'kl', 'rescaled_kl', 'l1', 'rescaled_l1' + ] + self.betas = betas + self.num_timesteps = len(betas) + self.mean_type = mean_type + self.var_type = var_type + self.loss_type = loss_type + self.rescale_timesteps = rescale_timesteps + + # alphas + alphas = 1 - self.betas + self.alphas_cumprod = torch.cumprod(alphas, dim=0) + self.alphas_cumprod_prev = torch.cat( + [alphas.new_ones([1]), self.alphas_cumprod[:-1]]) + self.alphas_cumprod_next = torch.cat( + [self.alphas_cumprod[1:], + alphas.new_zeros([1])]) + + # q(x_t | x_{t-1}) + self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 + - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = torch.log(1.0 + - self.alphas_cumprod) + 
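        # the two sqrt-reciprocal factors below recover x0 from a predicted eps: x0 = sqrt(1 / alphas_cumprod) * x_t - sqrt(1 / alphas_cumprod - 1) * eps (see p_mean_variance) +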
self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod + - 1) + + # q(x_{t-1} | x_t, x_0) + self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / ( + 1.0 - self.alphas_cumprod) + self.posterior_log_variance_clipped = torch.log( + self.posterior_variance.clamp(1e-20)) + self.posterior_mean_coef1 = betas * torch.sqrt( + self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + self.posterior_mean_coef2 = ( + 1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / ( + 1.0 - self.alphas_cumprod) + + def q_sample(self, x0, t, noise=None): + r"""Sample from q(x_t | x_0). + """ + noise = torch.randn_like(x0) if noise is None else noise + u1 = _i(self.sqrt_alphas_cumprod, t, x0) * x0 + u2 = _i(self.sqrt_one_minus_alphas_cumprod, t, x0) * noise + return u1 + u2 + + def q_mean_variance(self, x0, t): + r"""Distribution of q(x_t | x_0). + """ + mu = _i(self.sqrt_alphas_cumprod, t, x0) * x0 + var = _i(1.0 - self.alphas_cumprod, t, x0) + log_var = _i(self.log_one_minus_alphas_cumprod, t, x0) + return mu, var, log_var + + def q_posterior_mean_variance(self, x0, xt, t): + r"""Distribution of q(x_{t-1} | x_t, x_0). + """ + mu = _i(self.posterior_mean_coef1, t, xt) * x0 + _i( + self.posterior_mean_coef2, t, xt) * xt + var = _i(self.posterior_variance, t, xt) + log_var = _i(self.posterior_log_variance_clipped, t, xt) + return mu, var, log_var + + @torch.no_grad() + def p_sample(self, + xt, + t, + model, + model_kwargs={}, + clamp=None, + percentile=None, + condition_fn=None, + guide_scale=None): + r"""Sample from p(x_{t-1} | x_t). + - condition_fn: for classifier-based guidance (guided-diffusion). + - guide_scale: for classifier-free guidance (glide/dalle-2). + """ + # predict distribution of p(x_{t-1} | x_t) + mu, var, log_var, x0 = self.p_mean_variance(xt, t, model, model_kwargs, + clamp, percentile, + guide_scale) + + # random sample (with optional conditional function) + noise = torch.randn_like(xt) + shape = (-1, *((1, ) * (xt.ndim - 1))) + mask = t.ne(0).float().view(shape) # no noise when t == 0 + if condition_fn is not None: + grad = condition_fn(xt, self._scale_timesteps(t), **model_kwargs) + mu = mu.float() + var * grad.float() + xt_1 = mu + mask * torch.exp(0.5 * log_var) * noise + return xt_1, x0 + + @torch.no_grad() + def p_sample_loop(self, + noise, + model, + model_kwargs={}, + clamp=None, + percentile=None, + condition_fn=None, + guide_scale=None): + r"""Sample from p(x_{t-1} | x_t) p(x_{t-2} | x_{t-1}) ... p(x_0 | x_1). + """ + # prepare input + b = noise.size(0) + xt = noise + + # diffusion process + for step in torch.arange(self.num_timesteps).flip(0): + t = torch.full((b, ), step, dtype=torch.long, device=xt.device) + xt, _ = self.p_sample(xt, t, model, model_kwargs, clamp, + percentile, condition_fn, guide_scale) + return xt + + def p_mean_variance(self, + xt, + t, + model, + model_kwargs={}, + clamp=None, + percentile=None, + guide_scale=None): + r"""Distribution of p(x_{t-1} | x_t). 
+ """ + # predict distribution + if guide_scale is None: + out = model(xt, self._scale_timesteps(t), **model_kwargs) + else: + # classifier-free guidance + # (model_kwargs[0]: conditional kwargs; model_kwargs[1]: non-conditional kwargs) + assert isinstance(model_kwargs, list) and len(model_kwargs) == 2 + y_out = model(xt, self._scale_timesteps(t), **model_kwargs[0]) + u_out = model(xt, self._scale_timesteps(t), **model_kwargs[1]) + cond = self.var_type.startswith('fixed') + dim = y_out.size(1) if cond else y_out.size(1) // 2 + u1 = u_out[:, :dim] + u2 = guide_scale * (y_out[:, :dim] - u_out[:, :dim]) + out = torch.cat([u1 + u2, y_out[:, dim:]], dim=1) + + # compute variance + if self.var_type == 'learned': + out, log_var = out.chunk(2, dim=1) + var = torch.exp(log_var) + elif self.var_type == 'learned_range': + out, fraction = out.chunk(2, dim=1) + min_log_var = _i(self.posterior_log_variance_clipped, t, xt) + max_log_var = _i(torch.log(self.betas), t, xt) + fraction = (fraction + 1) / 2.0 + log_var = fraction * max_log_var + (1 - fraction) * min_log_var + var = torch.exp(log_var) + elif self.var_type == 'fixed_large': + var = _i( + torch.cat([self.posterior_variance[1:2], self.betas[1:]]), t, + xt) + log_var = torch.log(var) + elif self.var_type == 'fixed_small': + var = _i(self.posterior_variance, t, xt) + log_var = _i(self.posterior_log_variance_clipped, t, xt) + + # compute mean and x0 + if self.mean_type == 'x_{t-1}': + mu = out # x_{t-1} + u1 = _i(1.0 / self.posterior_mean_coef1, t, xt) * mu + u2 = _i(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, + xt) * xt + x0 = u1 - u2 + elif self.mean_type == 'x0': + x0 = out + mu, _, _ = self.q_posterior_mean_variance(x0, xt, t) + elif self.mean_type == 'eps': + u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * out + x0 = u1 - u2 + mu, _, _ = self.q_posterior_mean_variance(x0, xt, t) + + # restrict the range of x0 + if percentile is not None: + assert percentile > 0 and percentile <= 1 # e.g., 0.995 + s = torch.quantile( + x0.flatten(1).abs(), percentile, + dim=1).clamp_(1.0).view(-1, 1, 1, 1) + x0 = torch.min(s, torch.max(-s, x0)) / s + elif clamp is not None: + x0 = x0.clamp(-clamp, clamp) + return mu, var, log_var, x0 + + @torch.no_grad() + def ddim_sample(self, + xt, + t, + model, + model_kwargs={}, + clamp=None, + percentile=None, + condition_fn=None, + guide_scale=None, + ddim_timesteps=20, + eta=0.0): + r"""Sample from p(x_{t-1} | x_t) using DDIM. + - condition_fn: for classifier-based guidance (guided-diffusion). + - guide_scale: for classifier-free guidance (glide/dalle-2). 
+ """ + stride = self.num_timesteps // ddim_timesteps + + # predict distribution of p(x_{t-1} | x_t) + _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, + percentile, guide_scale) + if condition_fn is not None: + # x0 -> eps + alpha = _i(self.alphas_cumprod, t, xt) + u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) + eps = u1 / u2 + eps = eps - (1 - alpha).sqrt() * condition_fn( + xt, self._scale_timesteps(t), **model_kwargs) + + # eps -> x0 + u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps + x0 = u1 - u2 + + # derive variables + u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) + eps = u1 / u2 + alphas = _i(self.alphas_cumprod, t, xt) + alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt) + u1 = (1 - alphas_prev) / (1 - alphas) + u2 = (1 - alphas / alphas_prev) + sigmas = eta * torch.sqrt(u1 * u2) + + # random sample + noise = torch.randn_like(xt) + direction = torch.sqrt(1 - alphas_prev - sigmas**2) * eps + mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1))) + xt_1 = torch.sqrt(alphas_prev) * x0 + direction + mask * sigmas * noise + return xt_1, x0 + + @torch.no_grad() + def ddim_sample_loop(self, + noise, + model, + model_kwargs={}, + clamp=None, + percentile=None, + condition_fn=None, + guide_scale=None, + ddim_timesteps=20, + eta=0.0): + # prepare input + b = noise.size(0) + xt = noise + + # diffusion process (TODO: clamp is inaccurate! Consider replacing the stride by explicit prev/next steps) + steps = (1 + torch.arange(0, self.num_timesteps, + self.num_timesteps // ddim_timesteps)).clamp( + 0, self.num_timesteps - 1).flip(0) + for step in steps: + t = torch.full((b, ), step, dtype=torch.long, device=xt.device) + xt, _ = self.ddim_sample(xt, t, model, model_kwargs, clamp, + percentile, condition_fn, guide_scale, + ddim_timesteps, eta) + return xt + + @torch.no_grad() + def ddim_reverse_sample(self, + xt, + t, + model, + model_kwargs={}, + clamp=None, + percentile=None, + guide_scale=None, + ddim_timesteps=20): + r"""Sample from p(x_{t+1} | x_t) using DDIM reverse ODE (deterministic). 
+ """ + stride = self.num_timesteps // ddim_timesteps + + # predict distribution of p(x_{t-1} | x_t) + _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, + percentile, guide_scale) + + # derive variables + u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) + eps = u1 / u2 + + alphas_next = _i( + torch.cat( + [self.alphas_cumprod, + self.alphas_cumprod.new_zeros([1])]), + (t + stride).clamp(0, self.num_timesteps), xt) + + # reverse sample + mu = torch.sqrt(alphas_next) * x0 + torch.sqrt(1 - alphas_next) * eps + return mu, x0 + + @torch.no_grad() + def ddim_reverse_sample_loop(self, + x0, + model, + model_kwargs={}, + clamp=None, + percentile=None, + guide_scale=None, + ddim_timesteps=20): + # prepare input + b = x0.size(0) + xt = x0 + + # reconstruction steps + steps = torch.arange(0, self.num_timesteps, + self.num_timesteps // ddim_timesteps) + for step in steps: + t = torch.full((b, ), step, dtype=torch.long, device=xt.device) + xt, _ = self.ddim_reverse_sample(xt, t, model, model_kwargs, clamp, + percentile, guide_scale, + ddim_timesteps) + return xt + + @torch.no_grad() + def plms_sample(self, + xt, + t, + model, + model_kwargs={}, + clamp=None, + percentile=None, + condition_fn=None, + guide_scale=None, + plms_timesteps=20): + r"""Sample from p(x_{t-1} | x_t) using PLMS. + - condition_fn: for classifier-based guidance (guided-diffusion). + - guide_scale: for classifier-free guidance (glide/dalle-2). + """ + stride = self.num_timesteps // plms_timesteps + + # function for compute eps + def compute_eps(xt, t): + # predict distribution of p(x_{t-1} | x_t) + _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, + clamp, percentile, guide_scale) + + # condition + if condition_fn is not None: + # x0 -> eps + alpha = _i(self.alphas_cumprod, t, xt) + u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) + eps = u1 / u2 + eps = eps - (1 - alpha).sqrt() * condition_fn( + xt, self._scale_timesteps(t), **model_kwargs) + + # eps -> x0 + u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps + x0 = u1 - u2 + + # derive eps + u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) + eps = u1 / u2 + return eps + + # function for compute x_0 and x_{t-1} + def compute_x0(eps, t): + # eps -> x0 + u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps + x0 = u1 - u2 + + # deterministic sample + alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt) + direction = torch.sqrt(1 - alphas_prev) * eps + # mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1))) + xt_1 = torch.sqrt(alphas_prev) * x0 + direction + return xt_1, x0 + + # PLMS sample + eps = compute_eps(xt, t) + if len(eps_cache) == 0: + # 2nd order pseudo improved Euler + xt_1, x0 = compute_x0(eps, t) + eps_next = compute_eps(xt_1, (t - stride).clamp(0)) + eps_prime = (eps + eps_next) / 2.0 + elif len(eps_cache) == 1: + # 2nd order pseudo linear multistep (Adams-Bashforth) + eps_prime = (3 * eps - eps_cache[-1]) / 2.0 + elif len(eps_cache) == 2: + # 3nd order pseudo linear multistep (Adams-Bashforth) + eps_prime = (23 * eps - 16 * eps_cache[-1] + + 5 * eps_cache[-2]) / 12.0 + elif len(eps_cache) >= 3: + # 4nd order pseudo linear multistep (Adams-Bashforth) + eps_prime = (55 * eps - 59 * eps_cache[-1] + 37 * eps_cache[-2] + - 9 * 
eps_cache[-3]) / 24.0 + xt_1, x0 = compute_x0(eps_prime, t) + return xt_1, x0, eps + + @torch.no_grad() + def plms_sample_loop(self, + noise, + model, + model_kwargs={}, + clamp=None, + percentile=None, + condition_fn=None, + guide_scale=None, + plms_timesteps=20): + # prepare input + b = noise.size(0) + xt = noise + + # diffusion process + steps = (1 + torch.arange(0, self.num_timesteps, + self.num_timesteps // plms_timesteps)).clamp( + 0, self.num_timesteps - 1).flip(0) + eps_cache = [] + for step in steps: + # PLMS sampling step + t = torch.full((b, ), step, dtype=torch.long, device=xt.device) + xt, _, eps = self.plms_sample(xt, t, model, model_kwargs, clamp, + percentile, condition_fn, + guide_scale, plms_timesteps, + eps_cache) + + # update eps cache + eps_cache.append(eps) + if len(eps_cache) >= 4: + eps_cache.pop(0) + return xt + + def loss(self, x0, t, model, model_kwargs={}, noise=None, input_x0=None): + noise = torch.randn_like(x0) if noise is None else noise + input_x0 = x0 if input_x0 is None else input_x0 + xt = self.q_sample(input_x0, t, noise=noise) + + # compute loss + if self.loss_type in ['kl', 'rescaled_kl']: + loss, _ = self.variational_lower_bound(x0, xt, t, model, + model_kwargs) + if self.loss_type == 'rescaled_kl': + loss = loss * self.num_timesteps + elif self.loss_type in ['mse', 'rescaled_mse', 'l1', 'rescaled_l1']: + out = model(xt, self._scale_timesteps(t), **model_kwargs) + + # VLB for variation + loss_vlb = 0.0 + if self.var_type in ['learned', 'learned_range']: + out, var = out.chunk(2, dim=1) + frozen = torch.cat([ + out.detach(), var + ], dim=1) # learn var without affecting the prediction of mean + loss_vlb, _ = self.variational_lower_bound( + x0, xt, t, model=lambda *args, **kwargs: frozen) + if self.loss_type.startswith('rescaled_'): + loss_vlb = loss_vlb * self.num_timesteps / 1000.0 + + # MSE/L1 for x0/eps + target = { + 'eps': noise, + 'x0': x0, + 'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0] + }[self.mean_type] + loss = (out - target).pow(1 if self.loss_type.endswith('l1') else 2 + ).abs().flatten(1).mean(dim=1) + + # total loss + loss = loss + loss_vlb + return loss + + def variational_lower_bound(self, + x0, + xt, + t, + model, + model_kwargs={}, + clamp=None, + percentile=None): + # compute groundtruth and predicted distributions + mu1, _, log_var1 = self.q_posterior_mean_variance(x0, xt, t) + mu2, _, log_var2, x0 = self.p_mean_variance(xt, t, model, model_kwargs, + clamp, percentile) + + # compute KL loss + kl = kl_divergence(mu1, log_var1, mu2, log_var2) + kl = kl.flatten(1).mean(dim=1) / math.log(2.0) + + # compute discretized NLL loss (for p(x0 | x1) only) + nll = -discretized_gaussian_log_likelihood( + x0, mean=mu2, log_scale=0.5 * log_var2) + nll = nll.flatten(1).mean(dim=1) / math.log(2.0) + + # NLL for p(x0 | x1) and KL otherwise + vlb = torch.where(t == 0, nll, kl) + return vlb, x0 + + @torch.no_grad() + def variational_lower_bound_loop(self, + x0, + model, + model_kwargs={}, + clamp=None, + percentile=None): + r"""Compute the entire variational lower bound, measured in bits-per-dim. 
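+        Returns a dict with per-timestep 'vlb', 'mse' and 'x0_mse' tensors of shape [B, T], plus per-sample 'prior_bits_per_dim' and 'total_bits_per_dim' entries.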
+ """ + # prepare input and output + b = x0.size(0) + metrics = {'vlb': [], 'mse': [], 'x0_mse': []} + + # loop + for step in torch.arange(self.num_timesteps).flip(0): + # compute VLB + t = torch.full((b, ), step, dtype=torch.long, device=x0.device) + noise = torch.randn_like(x0) + xt = self.q_sample(x0, t, noise) + vlb, pred_x0 = self.variational_lower_bound( + x0, xt, t, model, model_kwargs, clamp, percentile) + + # predict eps from x0 + u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) + u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) + eps = u1 / u2 + + # collect metrics + metrics['vlb'].append(vlb) + metrics['x0_mse'].append( + (pred_x0 - x0).square().flatten(1).mean(dim=1)) + metrics['mse'].append( + (eps - noise).square().flatten(1).mean(dim=1)) + metrics = {k: torch.stack(v, dim=1) for k, v in metrics.items()} + + # compute the prior KL term for VLB, measured in bits-per-dim + mu, _, log_var = self.q_mean_variance(x0, t) + kl_prior = kl_divergence(mu, log_var, torch.zeros_like(mu), + torch.zeros_like(log_var)) + kl_prior = kl_prior.flatten(1).mean(dim=1) / math.log(2.0) + + # update metrics + metrics['prior_bits_per_dim'] = kl_prior + metrics['total_bits_per_dim'] = metrics['vlb'].sum(dim=1) + kl_prior + return metrics + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + return t.float() * 1000.0 / self.num_timesteps + return t diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py new file mode 100644 index 00000000..c2d83b34 --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py @@ -0,0 +1,265 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math +import os.path as osp +from typing import Any, Dict + +import json +import numpy as np +import torch +import torch.cuda.amp as amp +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.multi_modal.multi_stage_diffusion.clip import CLIP +from modelscope.models.multi_modal.multi_stage_diffusion.decoder import Decoder +from modelscope.models.multi_modal.multi_stage_diffusion.gaussian_diffusion import ( + GaussianDiffusion, beta_schedule) +from modelscope.models.multi_modal.multi_stage_diffusion.prior import Prior +from modelscope.models.multi_modal.multi_stage_diffusion.tokenizer import ( + CLIPTokenizer, XGLMTokenizer) +from modelscope.models.multi_modal.multi_stage_diffusion.upsampler import ( + Upsampler256, Upsampler1024) +from modelscope.models.multi_modal.multi_stage_diffusion.xglm import XGLM +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['MultiStageDiffusionForTextToImageSynthesis'] + + +def make_diffusion(schedule, + num_timesteps=1000, + init_beta=None, + last_beta=None, + mean_type='eps', + var_type='fixed_small'): + betas = beta_schedule(schedule, num_timesteps, init_beta, last_beta) + diffusion = GaussianDiffusion( + betas, mean_type=mean_type, var_type=var_type) + return diffusion + + +class UnCLIP(nn.Module): + + def __init__(self, model_dir): + super(UnCLIP, self).__init__() + self.model_dir = model_dir + self.config = json.load(open(f'{model_dir}/{ModelFile.CONFIGURATION}')) + + # modules + self.clip = CLIP(**self.config['clip']).fp16() + self.xglm = XGLM(**self.config['xglm']) + self.prior = 
Prior(**self.config['prior']) + self.decoder = Decoder(**self.config['decoder']) + self.upsampler256 = Upsampler256(**self.config['upsampler256']) + self.upsampler1024 = Upsampler1024(**self.config['upsampler1024']) + + # diffusions + self.prior_diffusion = make_diffusion(**self.config['prior_diffusion']) + self.decoder_diffusion = make_diffusion( + **self.config['decoder_diffusion']) + self.upsampler256_diffusion = make_diffusion( + **self.config['upsampler256_diffusion']) + self.upsampler1024_diffusion = make_diffusion( + **self.config['upsampler1024_diffusion']) + + # tokenizers + self.clip_tokenizer = CLIPTokenizer( + bpe_path=f'{model_dir}/bpe_simple_vocab_16e6.txt.gz') + self.xglm_tokenizer = XGLMTokenizer(model_dir=model_dir) + + def forward(self, *args, **kwargs): + raise NotImplementedError( + '"forward" is not implemented. Use "synthesis" instead.') + + @torch.no_grad() + def synthesis(self, + text='A photo of a confused grizzly bear in calculus class.', + tokenizer='clip', + batch_size=4, + timesteps_prior=100, + timesteps_64=50, + timesteps_256=20, + timesteps_1024=20, + guide_prior=3.0, + guide_64=7.0, + guide_256=3.0, + guide_1024=3.0, + eta_prior=0.0, + eta_64=0.0, + eta_256=0.0, + eta_1024=0.0): + device = next(self.parameters()).device + + # check params + assert all([ + t > 0 and t <= 1000 for t in + [timesteps_prior, timesteps_64, timesteps_256, timesteps_1024] + ]) + assert all([ + g > 1 and g < 15 + for g in [guide_prior, guide_64, guide_256, guide_1024] + ]) + assert all([ + e >= 0 and e <= 1.0 + for e in [eta_prior, eta_64, eta_256, eta_1024] + ]) + assert batch_size >= 1 and batch_size <= 16 + + # tokenize the text + if tokenizer == 'clip': + y = F.normalize( + self.clip.textual(self.clip_tokenizer([text]).to(device)), + p=2, + dim=1) + zero_y = F.normalize( + self.clip.textual(self.clip_tokenizer(['']).to(device)), + p=2, + dim=1) + elif tokenizer == 'xglm': + y = F.normalize( + self.xglm(*to_device(self.xglm_tokenizer([text]), device)), + p=2, + dim=1) + zero_y = F.normalize( + self.xglm(*to_device(self.xglm_tokenizer(['']), device)), + p=2, + dim=1) + else: + raise ValueError( + f'Expected tokenizer to be one of "clip" or "xglm", but got {tokenizer}' + ) + y = math.sqrt(y.size(1)) * y.repeat(batch_size, 1) + zero_y = math.sqrt(zero_y.size(1)) * zero_y.repeat(batch_size, 1) + + # synthesis + with amp.autocast(enabled=True): + # prior + x0 = self.prior_diffusion.ddim_sample_loop( + noise=torch.randn_like(y), + model=self.prior, + model_kwargs=[{ + 'y': y + }, { + 'y': zero_y + }], + guide_scale=guide_prior, + ddim_timesteps=timesteps_prior, + eta=eta_prior) + + # decoder + imgs64 = self.decoder_diffusion.ddim_sample_loop( + noise=torch.randn(batch_size, 3, 64, 64).to(device), + model=self.decoder, + model_kwargs=[{ + 'y': x0 + }, { + 'y': torch.zeros_like(x0) + }], + guide_scale=guide_64, + percentile=0.995, + ddim_timesteps=timesteps_64, + eta=eta_64).clamp_(-1, 1) + + # upsampler256 + imgs256 = F.interpolate( + imgs64, scale_factor=4.0, mode='bilinear', align_corners=False) + imgs256 = self.upsampler256_diffusion.ddim_sample_loop( + noise=torch.randn_like(imgs256), + model=self.upsampler256, + model_kwargs=[{ + 'y': y, + 'concat': imgs256 + }, { + 'y': zero_y, + 'concat': imgs256 + }], + guide_scale=guide_256, + percentile=0.995, + ddim_timesteps=timesteps_256, + eta=eta_256).clamp_(-1, 1) + + # upsampler1024 + imgs1024 = F.interpolate( + imgs256, + scale_factor=4.0, + mode='bilinear', + align_corners=False) + imgs1024 = 
self.upsampler1024_diffusion.ddim_sample_loop( + noise=torch.randn_like(imgs1024), + model=self.upsampler1024, + model_kwargs=[{ + 'y': y, + 'concat': imgs1024 + }, { + 'y': zero_y, + 'concat': imgs1024 + }], + guide_scale=guide_1024, + percentile=0.995, + ddim_timesteps=timesteps_1024, + eta=eta_1024).clamp_(-1, 1) + + # output ([B, C, H, W] within range [0, 1]) + imgs1024 = imgs1024.add_(1).mul_(255 / 2.0).permute(0, 2, 3, 1).cpu() + imgs1024 = [ + Image.fromarray(np.array(u, dtype=np.uint8)) for u in imgs1024 + ] + return imgs1024 + + +@MODELS.register_module( + Tasks.text_to_image_synthesis, module_name=Models.multi_stage_diffusion) +class MultiStageDiffusionForTextToImageSynthesis(TorchModel): + + def __init__(self, model_dir, device_id=-1): + super().__init__(model_dir=model_dir, device_id=device_id) + model = UnCLIP(model_dir=model_dir) + pretrained_params = torch.load( + osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), 'cpu') + model.load_state_dict(pretrained_params) + model.eval() + + self.device_id = device_id + if self.device_id >= 0: + self.device = torch.device(f'cuda:{self.device_id}') + model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device = torch.device('cpu') + logger.info('Use CPU for inference') + self.model = model + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + if not isinstance(input, dict): + raise ValueError( + f'Expected the input to be a dictionary, but got {type(input)}' + ) + if 'text' not in input: + raise ValueError('input should contain "text", but not found') + + # ddim sampling + imgs = self.model.synthesis( + text=input.get('text'), + tokenizer=input.get('tokenizer', 'clip'), + batch_size=input.get('batch_size', 4), + timesteps_prior=input.get('timesteps_prior', 100), + timesteps_64=input.get('timesteps_64', 50), + timesteps_256=input.get('timesteps_256', 20), + timesteps_1024=input.get('timesteps_1024', 20), + guide_prior=input.get('guide_prior', 3.0), + guide_64=input.get('guide_64', 7.0), + guide_256=input.get('guide_256', 3.0), + guide_1024=input.get('guide_1024', 3.0), + eta_prior=input.get('eta_prior', 0.0), + eta_64=input.get('eta_64', 0.0), + eta_256=input.get('eta_256', 0.0), + eta_1024=input.get('eta_1024', 0.0)) + imgs = [np.array(u)[..., ::-1] for u in imgs] + return imgs diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py new file mode 100644 index 00000000..380fa467 --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py @@ -0,0 +1,170 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
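For reference, a minimal invocation sketch of the model class defined in model.py above. The directory path is a placeholder: it must contain the configuration file and pretrained weights loaded in the constructor, and every sampling parameter omitted from the input dict falls back to the defaults shown in forward():

    # sketch only: '/path/to/model_dir' is a placeholder for a local model directory
    from modelscope.models.multi_modal.multi_stage_diffusion.model import (
        MultiStageDiffusionForTextToImageSynthesis)

    model = MultiStageDiffusionForTextToImageSynthesis('/path/to/model_dir', device_id=0)
    imgs = model.forward({
        'text': 'A photo of a confused grizzly bear in calculus class.',
        'batch_size': 2,
        'timesteps_64': 50
    })
    # imgs: a list of 1024x1024 uint8 numpy arrays in BGR channel order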
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['Prior'] + + +def sinusoidal_embedding(timesteps, dim): + # check input + half = dim // 2 + timesteps = timesteps.float() + + # compute sinusoidal embedding + sinusoid = torch.outer( + timesteps, torch.pow(10000, + -torch.arange(half).to(timesteps).div(half))) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) + if dim % 2 != 0: + x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1) + return x + + +class SelfAttention(nn.Module): + + def __init__(self, dim, num_heads): + assert dim % num_heads == 0 + super(SelfAttention, self).__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = math.pow(self.head_dim, -0.25) + + # layers + self.to_qkv = nn.Linear(dim, dim * 3) + self.proj = nn.Linear(dim, dim) + + def forward(self, x, mask): + b, l, n, c = *x.shape[:2], self.num_heads, self.head_dim + + # compute query, key, value + q, k, v = self.to_qkv(x).view(b, l, n * 3, c).chunk(3, dim=2) + + # compute attention + attn = torch.einsum('binc,bjnc->bnij', q * self.scale, k * self.scale) + if mask is not None: + attn = attn.masked_fill(mask[:, :, :l, :l] == 0, float('-inf')) + attn = F.softmax(attn.float(), dim=-1).type(attn.dtype) + + # gather context + x = torch.einsum('bnij,bjnc->binc', attn, v) + x = x.reshape(b, l, -1) + + # output + x = self.proj(x) + return x + + +class AttentionBlock(nn.Module): + + def __init__(self, dim, num_heads): + super(AttentionBlock, self).__init__() + self.dim = dim + self.num_heads = num_heads + + # layers + self.norm1 = nn.LayerNorm(dim) + self.attn = SelfAttention(dim, num_heads) + self.norm2 = nn.LayerNorm(dim) + self.ffn = nn.Sequential( + nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)) + + def forward(self, x, mask=None): + x = x + self.attn(self.norm1(x), mask) + x = x + self.ffn(self.norm2(x)) + return x + + +class Prior(nn.Module): + + def __init__(self, dim=2048, clip_dim=768, num_heads=32, num_layers=24): + super(Prior, self).__init__() + self.dim = dim + self.clip_dim = clip_dim + self.num_heads = num_heads + self.num_layers = num_layers + + # embeddings + self.text_embedding = nn.Sequential( + nn.Linear(clip_dim, dim), nn.SiLU(), nn.Linear(dim, dim)) + self.time_embedding = nn.Sequential( + nn.Linear(dim, dim), nn.SiLU(), nn.Linear(dim, dim)) + self.vision_embedding = nn.Sequential( + nn.Linear(clip_dim, dim), nn.SiLU(), nn.Linear(dim, dim)) + self.eos_embedding = nn.Parameter(torch.zeros(1, 1, dim)) + self.pos_embedding = nn.Parameter(torch.zeros(1, 4, dim)) + + # transformer + self.blocks = nn.ModuleList( + [AttentionBlock(dim, num_heads) for _ in range(num_layers)]) + self.norm = nn.LayerNorm(dim) + + # head + self.head = nn.Linear(dim, clip_dim) + + # causal attention mask + self.register_buffer('attn_mask', torch.tril(torch.ones(1, 1, 4, 4))) + + # initialize weights + self.init_weights() + + def forward(self, x, t, y): + r"""x: [B, C]. + t: [B]. + y: [B, C]. 
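+        The transformer input is the 4-token sequence [text-emb, time-emb, image-emb, eos]; the head reads the final (eos) position to predict a clip_dim embedding.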
+ """ + b = x.size(0) + + # embeddings of shape [B, L + 4, C] + u1 = sinusoidal_embedding(t, self.dim) + u2 = [ + self.text_embedding(y).unsqueeze(1), + self.time_embedding(u1).unsqueeze(1), + self.vision_embedding(x).unsqueeze(1), + self.eos_embedding.repeat(b, 1, 1) + ] + x = self.pos_embedding + torch.cat(u2, dim=1) + + # transformer + for block in self.blocks: + x = block(x, self.attn_mask) + x = self.norm(x) + + # head + x = self.head(x[:, -1]) + return x + + def init_weights(self): + std = 0.02 / math.sqrt(2.0 * self.num_layers) + for name, m in self.named_modules(): + if name.endswith('attn.proj') or name.endswith('ffn.2'): + # smaller std for output layers + nn.init.normal_(m.weight, std=std) + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.Linear, nn.Embedding)): + nn.init.normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def param_groups(self): + groups = [{ + 'params': [ + p for n, p in self.named_parameters() + if 'norm' in n or n.endswith('bias') + ], + 'weight_decay': + 0.0 + }, { + 'params': [ + p for n, p in self.named_parameters() + if not ('norm' in n or n.endswith('bias')) + ] + }] + return groups diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py new file mode 100644 index 00000000..6fd9bebe --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py @@ -0,0 +1,199 @@ +# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP. + +import gzip +import html +from functools import lru_cache + +import ftfy +import regex as re +import torch +from transformers import AutoTokenizer + +__all__ = ['CLIPTokenizer', 'XGLMTokenizer'] + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except Exception: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text + + +class CLIPTokenizer(object): + r"""CLIP tokenizer, adapted from https://github.com/openai/CLIP. 
+ """ + + def __init__(self, bpe_path, length=77): + self.bpe_path = bpe_path + self.length = length + + # init tokenizer + self.tokenizer = SimpleTokenizer(bpe_path=bpe_path) + self.sos_token = self.tokenizer.encoder['<|startoftext|>'] + self.eos_token = self.tokenizer.encoder['<|endoftext|>'] + self.vocab_size = len(self.tokenizer.encoder) + + def __call__(self, sequence): + if isinstance(sequence, str): + return torch.LongTensor(self._tokenizer(sequence)) + elif isinstance(sequence, list): + return torch.LongTensor([self._tokenizer(u) for u in sequence]) + else: + raise TypeError( + f'Expected the "sequence" to be a string or a list, but got {type(sequence)}' + ) + + def _tokenizer(self, text): + tokens = self.tokenizer.encode(text)[:self.length - 2] + tokens = [self.sos_token] + tokens + [self.eos_token] + tokens = tokens + [0] * (self.length - len(tokens)) + return tokens + + +class XGLMTokenizer(object): + r"""A wrapper of HuggingFace's XGLM tokenizer. + """ + + def __init__(self, model_dir, length=77, **kwargs): + self.length = length + self.tokenizer = AutoTokenizer.from_pretrained(model_dir, **kwargs) + self.vocab_size = self.tokenizer.vocab_size + + def __call__(self, sequence, **kwargs): + _kwargs = { + 'return_tensors': 'pt', + 'padding': 'max_length', + 'truncation': True, + 'max_length': self.length + } + _kwargs.update(**kwargs) + tokens = self.tokenizer(sequence, **_kwargs) + return tokens.input_ids, tokens.attention_mask diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py new file mode 100644 index 00000000..4e99a514 --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py @@ -0,0 +1,466 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['Upsampler256', 'Upsampler1024'] + + +def sinusoidal_embedding(timesteps, dim): + # check input + half = dim // 2 + timesteps = timesteps.float() + + # compute sinusoidal embedding + sinusoid = torch.outer( + timesteps, torch.pow(10000, + -torch.arange(half).to(timesteps).div(half))) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) + if dim % 2 != 0: + x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1) + return x + + +class Resample(nn.Module): + + def __init__(self, in_dim, out_dim, scale_factor, use_conv=False): + assert scale_factor in [0.5, 1.0, 2.0] + super(Resample, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.scale_factor = scale_factor + self.use_conv = use_conv + + # layers + if scale_factor == 2.0: + self.resample = nn.Sequential( + nn.Upsample(scale_factor=scale_factor, mode='nearest'), + nn.Conv2d(in_dim, out_dim, 3, padding=1) + if use_conv else nn.Identity()) + elif scale_factor == 0.5: + self.resample = nn.Conv2d( + in_dim, out_dim, 3, stride=2, + padding=1) if use_conv else nn.AvgPool2d( + kernel_size=2, stride=2) + else: + self.resample = nn.Identity() + + def forward(self, x): + return self.resample(x) + + +class ResidualBlock(nn.Module): + + def __init__(self, + in_dim, + embed_dim, + out_dim, + use_scale_shift_norm=True, + scale_factor=1.0, + dropout=0.0): + super(ResidualBlock, self).__init__() + self.in_dim = in_dim + self.embed_dim = embed_dim + self.out_dim = out_dim + self.use_scale_shift_norm = use_scale_shift_norm + self.scale_factor = scale_factor + + # layers + self.layer1 = nn.Sequential( + nn.GroupNorm(32, in_dim), nn.SiLU(), + nn.Conv2d(in_dim, out_dim, 3, padding=1)) + self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False) + self.embedding = nn.Sequential( + nn.SiLU(), + nn.Linear(embed_dim, + out_dim * 2 if use_scale_shift_norm else out_dim)) + self.layer2 = nn.Sequential( + nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout), + nn.Conv2d(out_dim, out_dim, 3, padding=1)) + self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d( + in_dim, out_dim, 1) + + # zero out the last layer params + nn.init.zeros_(self.layer2[-1].weight) + + def forward(self, x, e): + identity = self.resample(x) + x = self.layer1[-1](self.resample(self.layer1[:-1](x))) + e = self.embedding(e).unsqueeze(-1).unsqueeze(-1).type(x.dtype) + if self.use_scale_shift_norm: + scale, shift = e.chunk(2, dim=1) + x = self.layer2[0](x) * (1 + scale) + shift + x = self.layer2[1:](x) + else: + x = x + e + x = self.layer2(x) + x = x + self.shortcut(identity) + return x + + +class AttentionBlock(nn.Module): + + def __init__(self, dim, context_dim=None, num_heads=None, head_dim=None): + # consider head_dim first, then num_heads + num_heads = dim // head_dim if head_dim else num_heads + head_dim = dim // num_heads + assert num_heads * head_dim == dim + super(AttentionBlock, self).__init__() + self.dim = dim + self.context_dim = context_dim + self.num_heads = num_heads + self.head_dim = head_dim + self.scale = math.pow(head_dim, -0.25) + + # layers + self.norm = nn.GroupNorm(32, dim) + self.to_qkv = nn.Conv2d(dim, dim * 3, 1) + if context_dim is not None: + self.context_kv = nn.Linear(context_dim, dim * 2) + self.proj = nn.Conv2d(dim, dim, 1) + + # zero out the last layer params + nn.init.zeros_(self.proj.weight) + + def forward(self, x, context=None): + r"""x: [B, C, H, W]. + context: [B, L, C] or None. 
+ """ + identity = x + b, c, h, w, n, d = *x.size(), self.num_heads, self.head_dim + + # compute query, key, value + x = self.norm(x) + q, k, v = self.to_qkv(x).view(b, n * 3, d, h * w).chunk(3, dim=1) + if context is not None: + ck, cv = self.context_kv(context).reshape(b, -1, n * 2, + d).permute(0, 2, 3, + 1).chunk( + 2, dim=1) + k = torch.cat([ck, k], dim=-1) + v = torch.cat([cv, v], dim=-1) + + # compute attention + attn = torch.matmul(q.transpose(-1, -2) * self.scale, k * self.scale) + attn = F.softmax(attn, dim=-1) + + # gather context + x = torch.matmul(v, attn.transpose(-1, -2)) + x = x.reshape(b, c, h, w) + + # output + x = self.proj(x) + return x + identity + + +class Upsampler256(nn.Module): + + def __init__(self, + in_dim=6, + dim=320, + y_dim=768, + context_dim=512, + out_dim=3, + dim_mult=[1, 2, 3, 4], + num_heads=None, + head_dim=64, + num_res_blocks=3, + attn_scales=[1 / 8], + resblock_resample=True, + use_scale_shift_norm=True, + dropout=0.1): + embed_dim = dim * 4 + super(Upsampler256, self).__init__() + self.in_dim = in_dim + self.dim = dim + self.y_dim = y_dim + self.context_dim = context_dim + self.embed_dim = embed_dim + self.out_dim = out_dim + self.dim_mult = dim_mult + self.num_heads = num_heads + self.head_dim = head_dim + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.resblock_resample = resblock_resample + self.use_scale_shift_norm = use_scale_shift_norm + + # params + enc_dims = [dim * u for u in [1] + dim_mult] + dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]] + shortcut_dims = [] + scale = 1.0 + + # embeddings + self.time_embedding = nn.Sequential( + nn.Linear(dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, embed_dim)) + self.y_embedding = nn.Sequential( + nn.Linear(y_dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, embed_dim)) + self.context_embedding = nn.Sequential( + nn.Linear(y_dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, context_dim * 4)) + + # encoder + self.encoder = nn.ModuleList( + [nn.Conv2d(self.in_dim, dim, 3, padding=1)]) + shortcut_dims.append(dim) + for i, (in_dim, + out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])): + for j in range(num_res_blocks): + # residual (+attention) blocks + block = nn.ModuleList([ + ResidualBlock(in_dim, embed_dim, out_dim, + use_scale_shift_norm, 1.0, dropout) + ]) + if scale in attn_scales: + block.append( + AttentionBlock(out_dim, context_dim, num_heads, + head_dim)) + in_dim = out_dim + self.encoder.append(block) + shortcut_dims.append(out_dim) + + # downsample + if i != len(dim_mult) - 1 and j == num_res_blocks - 1: + if resblock_resample: + downsample = ResidualBlock(out_dim, embed_dim, out_dim, + use_scale_shift_norm, 0.5, + dropout) + else: + downsample = Resample( + out_dim, out_dim, 0.5, use_conv=True) + shortcut_dims.append(out_dim) + scale /= 2.0 + self.encoder.append(downsample) + + # middle + self.middle = nn.ModuleList([ + ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm, + 1.0, dropout), + AttentionBlock(out_dim, context_dim, num_heads, head_dim), + ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm, + 1.0, dropout) + ]) + + # decoder + self.decoder = nn.ModuleList() + for i, (in_dim, + out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])): + for j in range(num_res_blocks + 1): + # residual (+attention) blocks + block = nn.ModuleList([ + ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim, + out_dim, use_scale_shift_norm, 1.0, dropout) + ]) + if scale in attn_scales: + block.append( + AttentionBlock(out_dim, 
context_dim, num_heads, + head_dim)) + in_dim = out_dim + + # upsample + if i != len(dim_mult) - 1 and j == num_res_blocks: + if resblock_resample: + upsample = ResidualBlock(out_dim, embed_dim, out_dim, + use_scale_shift_norm, 2.0, + dropout) + else: + upsample = Resample( + out_dim, out_dim, 2.0, use_conv=True) + scale *= 2.0 + block.append(upsample) + self.decoder.append(block) + + # head + self.head = nn.Sequential( + nn.GroupNorm(32, out_dim), nn.SiLU(), + nn.Conv2d(out_dim, self.out_dim, 3, padding=1)) + + # zero out the last layer params + nn.init.zeros_(self.head[-1].weight) + + def forward(self, x, t, y, concat): + # embeddings + x = torch.cat([x, concat], dim=1) + e = self.time_embedding(sinusoidal_embedding( + t, self.dim)) + self.y_embedding(y) + context = self.context_embedding(y).view(-1, 4, self.context_dim) + + # encoder + xs = [] + for block in self.encoder: + x = self._forward_single(block, x, e, context) + xs.append(x) + + # middle + for block in self.middle: + x = self._forward_single(block, x, e, context) + + # decoder + for block in self.decoder: + x = torch.cat([x, xs.pop()], dim=1) + x = self._forward_single(block, x, e, context) + + # head + x = self.head(x) + return x + + def _forward_single(self, module, x, e, context): + if isinstance(module, ResidualBlock): + x = module(x, e) + elif isinstance(module, AttentionBlock): + x = module(x, context) + elif isinstance(module, nn.ModuleList): + for block in module: + x = self._forward_single(block, x, e, context) + else: + x = module(x) + return x + + +class Upsampler1024(nn.Module): + + def __init__(self, + in_dim=6, + dim=192, + y_dim=768, + out_dim=3, + dim_mult=[1, 1, 2, 2, 4, 4], + num_res_blocks=2, + resblock_resample=True, + use_scale_shift_norm=True, + dropout=0.0): + embed_dim = dim * 4 + super(Upsampler1024, self).__init__() + self.in_dim = in_dim + self.dim = dim + self.y_dim = y_dim + self.out_dim = out_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.resblock_resample = resblock_resample + self.use_scale_shift_norm = use_scale_shift_norm + + # params + enc_dims = [dim * u for u in [1] + dim_mult] + dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]] + shortcut_dims = [] + scale = 1.0 + + # embedding + self.time_embedding = nn.Sequential( + nn.Linear(dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, embed_dim)) + self.y_embedding = nn.Sequential( + nn.Linear(y_dim, embed_dim), nn.SiLU(), + nn.Linear(embed_dim, embed_dim)) + + # encoder + self.encoder = nn.ModuleList( + [nn.Conv2d(self.in_dim, dim, 3, padding=1)]) + shortcut_dims.append(dim) + for i, (in_dim, + out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])): + for j in range(num_res_blocks): + # residual block + block = nn.ModuleList([ + ResidualBlock(in_dim, embed_dim, out_dim, + use_scale_shift_norm, 1.0, dropout) + ]) + shortcut_dims.append(out_dim) + in_dim = out_dim + self.encoder.append(block) + + # downsample + if i != len(dim_mult) - 1 and j == num_res_blocks - 1: + if resblock_resample: + downsample = ResidualBlock(out_dim, embed_dim, out_dim, + use_scale_shift_norm, 0.5, + dropout) + else: + downsample = Resample( + out_dim, out_dim, 0.5, use_conv=True) + shortcut_dims.append(out_dim) + scale /= 2.0 + self.encoder.append(downsample) + + # middle + self.middle = nn.ModuleList([ + ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm, + 1.0, dropout), + ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm, + 1.0, dropout) + ]) + + # decoder + self.decoder = nn.ModuleList() + for i, 
(in_dim, + out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])): + for j in range(num_res_blocks + 1): + # residual block + block = nn.ModuleList([ + ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim, + out_dim, use_scale_shift_norm, 1.0, dropout) + ]) + in_dim = out_dim + + # upsample + if i != len(dim_mult) - 1 and j == num_res_blocks: + if resblock_resample: + upsample = ResidualBlock(out_dim, embed_dim, out_dim, + use_scale_shift_norm, 2.0, + dropout) + else: + upsample = Resample( + out_dim, out_dim, 2.0, use_conv=True) + scale *= 2.0 + block.append(upsample) + self.decoder.append(block) + + # head + self.head = nn.Sequential( + nn.GroupNorm(32, out_dim), nn.SiLU(), + nn.Conv2d(out_dim, self.out_dim, 3, padding=1)) + + # zero out the last layer params + nn.init.zeros_(self.head[-1].weight) + + def forward(self, x, t, y, concat): + # embedding + x = torch.cat([x, concat], dim=1) + e = self.time_embedding(sinusoidal_embedding( + t, self.dim)) + self.y_embedding(y) + + # encoder + xs = [] + for block in self.encoder: + x = self._forward_single(block, x, e) + xs.append(x) + + # middle + for block in self.middle: + x = self._forward_single(block, x, e) + + # decoder + for block in self.decoder: + x = torch.cat([x, xs.pop()], dim=1) + x = self._forward_single(block, x, e) + + # head + x = self.head(x) + return x + + def _forward_single(self, module, x, e): + if isinstance(module, ResidualBlock): + x = module(x, e) + elif isinstance(module, nn.ModuleList): + for block in module: + x = self._forward_single(block, x, e) + else: + x = module(x) + return x diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py new file mode 100644 index 00000000..8a0b3ff1 --- /dev/null +++ b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py @@ -0,0 +1,205 @@ +# The implementation here is modified based on HuggingFace XGLM, publicly available +# at https://github.com/huggingface/transformers. 
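Both super-resolution U-Nets above receive the lower-resolution result through the concat argument, which is concatenated channel-wise with the noisy input (hence the default in_dim=6). A shape-check sketch for Upsampler256, assuming random weights and the default arguments:

    # sketch only: random weights, default arguments; heavy on CPU but runnable
    import torch
    import torch.nn.functional as F

    from modelscope.models.multi_modal.multi_stage_diffusion.upsampler import Upsampler256

    net = Upsampler256().eval()
    lowres = F.interpolate(
        torch.randn(1, 3, 64, 64), scale_factor=4.0, mode='bilinear',
        align_corners=False)               # bilinearly upsampled 64x64 result
    x = torch.randn(1, 3, 256, 256)        # noisy 256x256 input x_t
    t = torch.randint(0, 1000, (1, ))
    y = torch.randn(1, 768)                # conditioning embedding (y_dim=768)
    with torch.no_grad():
        out = net(x, t, y, concat=lowres)  # -> [1, 3, 256, 256]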
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['XGLM'] + + +def sinusoidal_embedding(seq_len, dim, pad_token=None): + half = dim // 2 + sinusoid = torch.outer( + torch.arange(seq_len, dtype=torch.float32), + torch.pow(10000, + -torch.arange(half, dtype=torch.float32).div(half - 1))) + x = torch.cat([torch.sin(sinusoid), torch.cos(sinusoid)], dim=1) + if dim % 2 == 1: + x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1) + if pad_token is not None: + x[pad_token, :] = 0 + return x + + +class SinusoidalEmbedding(nn.Module): + + def __init__(self, seq_len, dim, pad_token): + super(SinusoidalEmbedding, self).__init__() + self.seq_len = seq_len + self.dim = dim + self.pad_token = pad_token + self.register_buffer('weight', + sinusoidal_embedding(seq_len + 2, dim, pad_token)) + + def forward(self, tokens): + mask = tokens.ne(self.pad_token).long() + indices = torch.cumsum(mask, dim=1) * mask + self.pad_token + pos_embeds = self.weight.index_select(0, indices.view(-1)).view( + *tokens.shape, -1) + return pos_embeds + + +class GELU(nn.Module): + + def forward(self, x): + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +class SelfAttention(nn.Module): + + def __init__(self, dim, num_heads, dropout=0.1): + assert dim % num_heads == 0 + super(SelfAttention, self).__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = 1.0 / math.sqrt(self.head_dim) + + # layers + self.q = nn.Linear(dim, dim) + self.k = nn.Linear(dim, dim) + self.v = nn.Linear(dim, dim) + self.o = nn.Linear(dim, dim) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask=None): + r"""x: [B, L, C]. + mask: [B, *, L, L] or None. + """ + b, l, n, c = *x.shape[:2], self.num_heads, self.head_dim + + # compute query, key, value + q = self.q(x).view(b, l, n, c) + k = self.k(x).view(b, l, n, c) + v = self.v(x).view(b, l, n, c) + + # compute attention + attn = self.scale * torch.einsum('binc,bjnc->bnij', q, k) + if mask is not None: + attn = attn.masked_fill(mask == 0, float('-inf')) + attn = F.softmax(attn, dim=-1) + attn = self.dropout(attn) + + # gather context + x = torch.einsum('bnij,bjnc->binc', attn, v) + x = x.reshape(b, l, -1) + + # output + x = self.o(x) + x = self.dropout(x) + return x + + +class AttentionBlock(nn.Module): + + def __init__(self, dim, ffn_dim, ffn_act, num_heads, dropout=0.1): + assert ffn_act in ['gelu', 'relu'] + super(AttentionBlock, self).__init__() + self.dim = dim + self.ffn_dim = ffn_dim + self.ffn_act = ffn_act + self.num_heads = num_heads + + # layers + self.norm1 = nn.LayerNorm(dim) + self.attn = SelfAttention(dim, num_heads, dropout) + self.norm2 = nn.LayerNorm(dim) + self.ffn = nn.Sequential( + nn.Linear(dim, ffn_dim), + GELU() if ffn_act == 'gelu' else nn.ReLU(inplace=True), + nn.Linear(ffn_dim, dim), nn.Dropout(dropout)) + + def forward(self, x, mask=None): + x = x + self.attn(self.norm1(x), mask) + x = x + self.ffn(self.norm2(x)) + return x + + +class XGLM(nn.Module): + r"""A multilingual GPT model with an embedding head. 
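+        The hidden state at the appended eos position is projected to an embed_dim (768 by default) vector used as the text conditioning embedding.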
+ """ + + def __init__(self, + vocab_size=256008, + max_seq_len=2048, + dim=1024, + ffn_dim=4096, + ffn_act='gelu', + embed_dim=768, + num_heads=16, + num_layers=24, + pad_token=1, + dropout=0.1): + super(XGLM, self).__init__() + self.vocab_size = vocab_size + self.max_seq_len = max_seq_len + self.dim = dim + self.ffn_dim = ffn_dim + self.ffn_act = ffn_act + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_layers = num_layers + self.pad_token = pad_token + self.scale = math.sqrt(dim) # rescale token embedings + + # layers + self.token_embedding = nn.Embedding(vocab_size, dim, pad_token) + self.pos_embedding = SinusoidalEmbedding(max_seq_len, dim, pad_token) + self.eos_embedding = nn.Parameter(torch.randn(1, 1, dim)) + self.dropout = nn.Dropout(dropout) + self.blocks = nn.ModuleList([ + AttentionBlock(dim, ffn_dim, ffn_act, num_heads, dropout) + for _ in range(num_layers) + ]) + self.norm = nn.LayerNorm(dim) + self.head = nn.Linear(dim, embed_dim, bias=False) + + # causal attention mask + self.register_buffer( + 'attn_mask', + torch.tril(torch.ones(1, 1, 1 + max_seq_len, 1 + max_seq_len))) + + # init weights + self.apply(self.init_weights) + + def forward(self, tokens, mask=None): + r"""tokens: [B, L]. + mask: [B, L]. + """ + b, seq_len = tokens.size(0), 1 + tokens.size(1) + + # embeddings + x = self.scale * self.token_embedding(tokens) + x = torch.cat([x, self.eos_embedding.repeat(b, 1, 1)], dim=1) + # x = x + self.pos_embedding(tokens) + x = self.dropout(x) + + # attention mask + if mask is None: + mask = self.attn_mask[:, :, :seq_len, :seq_len].repeat(b, 1, 1, 1) + else: + mask = self.attn_mask[:, :, :seq_len, :seq_len] * torch.cat( + [mask, torch.zeros_like(mask[:, :1])], dim=1).view( + b, 1, 1, seq_len) + + # transformer + for block in self.blocks: + x = block(x, mask) + x = self.norm(x) + + # head + logits = self.head(x[:, -1]) + return logits + + def init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Embedding): + nn.init.normal_(m.weight, std=0.02) + if m.padding_idx is not None: + nn.init.zeros_(m.weight[m.padding_idx]) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index fd61e40b..443cb214 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -5,39 +5,43 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .backbones import SbertModel - from .heads import SequenceClassificationHead + from .bart_for_text_error_correction import BartForTextErrorCorrection from .bert_for_sequence_classification import BertForSequenceClassification from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation - from .masked_language import ( - StructBertForMaskedLM, - VecoForMaskedLM, - BertForMaskedLM, - DebertaV2ForMaskedLM, - ) + from .heads import SequenceClassificationHead + from .gpt3 import GPT3ForTextGeneration + from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, + BertForMaskedLM, DebertaV2ForMaskedLM) + from .ponet_for_masked_language import PoNetForMaskedLM from .nncrf_for_named_entity_recognition import ( TransformerCRFForNamedEntityRecognition, LSTMCRFForNamedEntityRecognition) - from .token_classification import SbertForTokenClassification + from .palm_v2 import PalmForTextGeneration + from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering + from 
.star_text_to_sql import StarForTextToSql from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification from .space import SpaceForDialogIntent from .space import SpaceForDialogModeling from .space import SpaceForDialogStateTracking - from .star_text_to_sql import StarForTextToSql + from .table_question_answering import TableQuestionAnswering from .task_models import (InformationExtractionModel, - SingleBackboneTaskModelBase) - from .bart_for_text_error_correction import BartForTextErrorCorrection - from .gpt3 import GPT3ForTextGeneration - from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering + SequenceClassificationModel, + SingleBackboneTaskModelBase, + TokenClassificationModel) + from .token_classification import SbertForTokenClassification + from .sentence_embedding import SentenceEmbedding + from .passage_ranking import PassageRanking else: _import_structure = { - 'star_text_to_sql': ['StarForTextToSql'], 'backbones': ['SbertModel'], - 'heads': ['SequenceClassificationHead'], - 'csanmt_for_translation': ['CsanmtForTranslation'], + 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], + 'csanmt_for_translation': ['CsanmtForTranslation'], + 'heads': ['SequenceClassificationHead'], + 'gpt3': ['GPT3ForTextGeneration'], 'masked_language': [ 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', 'DebertaV2ForMaskedLM' @@ -46,8 +50,10 @@ else: 'TransformerCRFForNamedEntityRecognition', 'LSTMCRFForNamedEntityRecognition' ], + 'ponet_for_masked_language': ['PoNetForMaskedLM'], 'palm_v2': ['PalmForTextGeneration'], - 'token_classification': ['SbertForTokenClassification'], + 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], + 'star_text_to_sql': ['StarForTextToSql'], 'sequence_classification': ['VecoForSequenceClassification', 'SbertForSequenceClassification'], 'space': [ @@ -56,11 +62,12 @@ else: ], 'task_models': [ 'InformationExtractionModel', 'SequenceClassificationModel', - 'SingleBackboneTaskModelBase' + 'SingleBackboneTaskModelBase', 'TokenClassificationModel' ], - 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'gpt3': ['GPT3ForTextGeneration'], - 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], + 'token_classification': ['SbertForTokenClassification'], + 'table_question_answering': ['TableQuestionAnswering'], + 'sentence_embedding': ['SentenceEmbedding'], + 'passage_ranking': ['PassageRanking'], } import sys diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py index 92f3a4ec..e608f035 100644 --- a/modelscope/models/nlp/heads/sequence_classification_head.py +++ b/modelscope/models/nlp/heads/sequence_classification_head.py @@ -19,7 +19,6 @@ class SequenceClassificationHead(TorchHead): super().__init__(**kwargs) config = self.config self.num_labels = config.num_labels - self.config = config classifier_dropout = ( config['classifier_dropout'] if config.get('classifier_dropout') is not None else config['hidden_dropout_prob']) diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py new file mode 100644 index 00000000..481524ae --- /dev/null +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -0,0 +1,42 @@ +from typing import Dict + +import torch +import 
torch.nn.functional as F +from torch import nn + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module( + Tasks.token_classification, module_name=Heads.token_classification) +class TokenClassificationHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + config = self.config + self.num_labels = config.num_labels + classifier_dropout = ( + config['classifier_dropout'] if config.get('classifier_dropout') + is not None else config['hidden_dropout_prob']) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config['hidden_size'], + config['num_labels']) + + def forward(self, inputs=None): + if isinstance(inputs, dict): + assert inputs.get('sequence_output') is not None + sequence_output = inputs.get('sequence_output') + else: + sequence_output = inputs + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return {OutputKeys.LOGITS: logits} + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + logits = outputs[OutputKeys.LOGITS] + return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/passage_ranking.py new file mode 100644 index 00000000..68bca231 --- /dev/null +++ b/modelscope/models/nlp/passage_ranking.py @@ -0,0 +1,78 @@ +from typing import Any, Dict + +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.nlp import SbertForSequenceClassification +from modelscope.models.nlp.structbert import SbertPreTrainedModel +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['PassageRanking'] + + +@MODELS.register_module(Tasks.passage_ranking, module_name=Models.bert) +class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel): + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir, *args, **kwargs): + if hasattr(config, 'base_model_prefix'): + PassageRanking.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + self.train_batch_size = kwargs.get('train_batch_size', 4) + self.register_buffer( + 'target_label', + torch.zeros(self.train_batch_size, dtype=torch.long)) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=True) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + outputs = self.base_model.forward(**input) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if self.base_model.training: + scores = logits.view(self.train_batch_size, -1) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(scores, self.target_label) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} + + def sigmoid(self, logits): + return np.exp(logits) / (1 + np.exp(logits)) + + def postprocess(self, inputs: Dict[str, np.ndarray], + **kwargs) -> Dict[str, np.ndarray]: + logits = 
inputs['logits'].squeeze(-1).detach().cpu().numpy() + logits = self.sigmoid(logits).tolist() + result = {OutputKeys.SCORES: logits} + return result + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (1 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + num_labels = kwargs.get('num_labels', 1) + model_args = {} if num_labels is None else {'num_labels': num_labels} + + return super(SbertPreTrainedModel, PassageRanking).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py new file mode 100644 index 00000000..dbc20751 --- /dev/null +++ b/modelscope/models/nlp/plug/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_plug import PlugNLGConfig + from .modeling_plug import PlugModel + from .distributed_plug import DistributedPlug +else: + _import_structure = { + 'configuration_plug': ['PlugNLGConfig'], + 'modeling_plug': ['PlugModel'], + 'distributed_plug': ['DistributedPlug'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py new file mode 100644 index 00000000..64807392 --- /dev/null +++ b/modelscope/models/nlp/plug/configuration_plug.py @@ -0,0 +1,232 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
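For context on `PassageRanking` above: at inference the head's raw logits are squeezed and passed through a sigmoid to yield per-passage scores, while at training time the logits are regrouped per query and scored with a cross-entropy whose target is the registered `target_label` buffer of zeros, i.e. the first passage in each group is taken to be the positive. A minimal sketch of that grouped loss, with the batch and group sizes chosen only for the demo:

```python
# Sketch of the grouped cross-entropy used by PassageRanking at train time,
# assuming each query contributes one positive followed by hard negatives.
import torch

train_batch_size = 4   # number of queries per step (demo assumption)
group_size = 8         # 1 positive + 7 negatives per query (demo assumption)

# logits as produced by the classifier head: one score per (query, passage) pair
logits = torch.randn(train_batch_size * group_size, 1)

scores = logits.view(train_batch_size, -1)                 # [queries, passages per query]
target = torch.zeros(train_batch_size, dtype=torch.long)   # positive sits at index 0
loss = torch.nn.CrossEntropyLoss()(scores, target)
print(loss.item())
```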
+ +import copy + +import json +from transformers import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class PlugNLUConfig(PretrainedConfig): + model_type = 'plugNLU' + + def __init__(self, + vocab_size=21504, + original_vocab_size=21128, + hidden_size=8192, + num_hidden_layers=24, + num_attention_heads=128, + intermediate_size=32768, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=2048, + type_vocab_size=3, + initializer_range=0.00707, + deep_init=False, + deepspeed=False, + lr_decay_style='linear', + weight_decay=1e-2, + clip_grad=1.0, + warmup=0.0333, + pre_ln=True, + fp16=True, + fp32_layernorm=True, + fp32_embedding=False, + fp32_tokentypes=False, + layernorm_epsilon=1e-5, + dec_hidden_layers=6, + pruning_method=None, + pruning_mask_init='constant', + pruning_mask_scale=0.0, + pruning_initial_threshold=1.0, + pruning_final_threshold=0.01, + pruning_initial_warmup=1, + pruning_final_warmup=20, + pruning_module='decoder', + pruning_decay_step=50, + pruning_decay_type='exp', + ft_module=None, + attn_separate=False, + LR_weight_rank=8, + LR_mask_rank=8, + **kwargs): + super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) + + self.vocab_size = vocab_size + self.original_vocab_size = original_vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.deep_init = deep_init + self.deepspeed = deepspeed + self.lr_decay_style = lr_decay_style + self.weight_decay = weight_decay + self.clip_grad = clip_grad + self.warmup = warmup + self.pre_ln = pre_ln + self.fp16 = fp16 + self.fp32_layernorm = fp32_layernorm + self.fp32_embedding = fp32_embedding + self.layernorm_epsilon = layernorm_epsilon + self.fp32_tokentypes = fp32_tokentypes + self.dec_hidden_layers = dec_hidden_layers + self.pruning_method = pruning_method + self.pruning_mask_init = pruning_mask_init + self.pruning_mask_scale = pruning_mask_scale + self.pruning_module = pruning_module + self.pruning_initial_threshold = pruning_initial_threshold + self.pruning_final_threshold = pruning_final_threshold + self.pruning_initial_warmup = pruning_initial_warmup + self.pruning_final_warmup = pruning_final_warmup + self.pruning_decay_step = pruning_decay_step + self.pruning_decay_type = pruning_decay_type + self.ft_module = ft_module + self.attn_separate = attn_separate + self.LR_weight_rank = LR_weight_rank + self.LR_mask_rank = LR_mask_rank + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = PlugNLUConfig() + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, 'r', encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def merge_args(self, args): + """merge values a `BertConfig` from a json file of parameters.""" + local_keys = self.__dict__.keys() + for key, value in args.__dict__.items(): + if key 
in local_keys: + continue + self.__dict__[key] = value + return self + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + +class PlugNLGConfig(PlugNLUConfig): + model_type = 'plugNLG' + + def __init__(self, + vocab_size=21504, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.00707, + deep_init=False, + deepspeed=False, + lr_decay_style='linear', + weight_decay=1e-2, + clip_grad=1.0, + warmup=0.01, + pre_ln=False, + fp16=False, + fp32_layernorm=False, + fp32_embedding=False, + fp32_tokentypes=False, + layernorm_epsilon=1e-12, + dec_hidden_layers=6, + pruning_method=None, + pruning_mask_init='constant', + pruning_mask_scale=0.0, + pruning_initial_threshold=1.0, + pruning_final_threshold=0.01, + pruning_initial_warmup=1, + pruning_final_warmup=20, + pruning_module='decoder', + pruning_decay_step=50, + pruning_decay_type='exp', + ft_module=None, + attn_separate=False, + LR_weight_rank=8, + LR_mask_rank=8, + **kwargs): + super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.deep_init = deep_init + self.deepspeed = deepspeed + self.lr_decay_style = lr_decay_style + self.weight_decay = weight_decay + self.clip_grad = clip_grad + self.warmup = warmup + self.pre_ln = pre_ln + self.fp16 = fp16 + self.fp32_layernorm = fp32_layernorm + self.fp32_embedding = fp32_embedding + self.layernorm_epsilon = layernorm_epsilon + self.fp32_tokentypes = fp32_tokentypes + self.dec_hidden_layers = dec_hidden_layers + self.pruning_method = pruning_method + self.pruning_mask_init = pruning_mask_init + self.pruning_mask_scale = pruning_mask_scale + self.pruning_module = pruning_module + self.pruning_initial_threshold = pruning_initial_threshold + self.pruning_final_threshold = pruning_final_threshold + self.pruning_initial_warmup = pruning_initial_warmup + self.pruning_final_warmup = pruning_final_warmup + self.pruning_decay_step = pruning_decay_step + self.pruning_decay_type = pruning_decay_type + self.ft_module = ft_module + self.attn_separate = attn_separate + self.LR_weight_rank = LR_weight_rank + self.LR_mask_rank = LR_mask_rank diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py new file mode 100644 index 00000000..2992f595 --- /dev/null +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -0,0 +1,191 @@ +import os +from typing import Dict + +import torch +import torch.nn.functional as F +from megatron import mpu +from megatron.fp16 import FP16_Module +from megatron.utils import print_rank_0 + +from modelscope.models import TorchModel +from modelscope.models.base 
import Tensor +from modelscope.utils.logger import get_logger +from modelscope.utils.nlp.distributed import initialize_distributed +from modelscope.utils.nlp.load_checkpoint import pre_load +from modelscope.utils.torch_utils import set_random_seed_mpu +from . import PlugModel +from .configuration_plug import PlugNLGConfig + +logger = get_logger(__name__) + + +class DistributedPlug(TorchModel): + + def __init__(self, model_dir, rank, **kwargs): + super().__init__(model_dir, **kwargs) + self.rank = rank + self.model_cfg = kwargs + self.config = PlugNLGConfig.from_pretrained(model_dir) + initialize_distributed(rank, mpu, kwargs['world_size'], + kwargs['model_parallel_size'], + kwargs['master_ip'], kwargs['master_port']) + seed = 0 if 'seed' not in kwargs else kwargs['seed'] + set_random_seed_mpu(seed) + self.iteration = 0 + self.dist_model = self.initialize_model(path_load_tag='model') + + def initialize_model(self, path_load_tag='model'): + """Build the model.""" + print_rank_0('Building Plug model. It will take a few minutes ...') + model = PlugModel(self.config) + + if mpu.get_data_parallel_rank() == 0: + logger.info( + ' > number of parameters on model parallel rank {}: {}'.format( + mpu.get_model_parallel_rank(), + sum([p.nelement() for p in model.parameters()]))) + + if self.config.deepspeed and self.config.fp16: + model.half() + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if self.config.fp16: + model = FP16_Module(model) + if self.config.fp32_embedding: + model.module.model.bert.embeddings.word_embeddings.float() + model.module.model.bert.embeddings.position_embeddings.float() + model.module.model.bert.embeddings.token_type_embeddings.float( + ) + if self.config.fp32_tokentypes: + model.module.model.bert.embeddings.token_type_embeddings.float( + ) + if self.config.fp32_layernorm: + for name, _module in model.named_modules(): + if 'LayerNorm' in name: + _module.float() + + load_model = pre_load(mpu, self.model_dir, tag=path_load_tag) + model_dict = model.module.model.state_dict() + for key in load_model: + if key not in model_dict.keys(): + print_rank_0('Skip key: ' + key) + else: + print_rank_0('Loading key: ' + key) + model.module.model.load_state_dict(load_model, strict=False) + return model + + @staticmethod + def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + # This function has been mostly taken from huggingface conversational ai code at + # https://medium.com/huggingface/how-to-build-a-state-of-the-art- + # conversational-ai-with-transfer-learning-2d818ac26313 + + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # convert to 1D + logits = logits.view(logits.size()[1]).contiguous() + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + # going back to 2D + logits = logits.view(1, -1).contiguous() + return 
logits + + def generate(self, input: Dict[str, Tensor], out_length=128, *kwargs): + device = torch.cuda.current_device() + batch_size = input['input_ids'].shape[0] + tokens = input['input_ids'].view(1, -1).contiguous().to(device) + dec_input_ids = input['dec_input_ids'].to(device) + attention_mask = input['attention_mask'].to(device) + self.dist_model.eval() + with torch.no_grad(): + # Only supports batch_size=1 + all_generate_tokens = [] + generate_tokens = [] + counter = 0 + sequence_output = None + vocab_size = self.config.original_vocab_size + sep_token_idx = 102 # index of [SEP] token in BertTokenizer + while counter < out_length: + if counter % 128 == 0 and counter != 0: + # Sliding window + generate_tokens.append(sep_token_idx) + start = (tokens == sep_token_idx).nonzero( + as_tuple=True)[-1] + if start + len(generate_tokens) >= 512: + tokens = torch.cat([ + tokens[:start], + torch.cuda.LongTensor(generate_tokens) + ], -1)[-512:] + else: + tokens[0][start:start + len(generate_tokens + )] = torch.cuda.LongTensor( + generate_tokens) + + attention_mask = (tokens != 0) + dec_input_ids = input['dec_input_ids'].to(device) + generate_tokens = [] + sequence_output = None + + position_ids = torch.full([batch_size, 1], + len(generate_tokens), + dtype=torch.long, + device=device) + _, logits, sequence_output = self.dist_model( + tokens, + None, + attention_mask, + dec_input_ids, + attention_mask, + position_ids, + is_infer=True, + sequence_output=sequence_output, + parallel_output=False) + logits = logits[:, -1, :] + logits = logits / self.model_cfg['temperature'] + logits = self.top_k_logits( + logits, + top_k=self.model_cfg['top_k'], + top_p=self.model_cfg['top_p']) + log_probs = F.softmax(logits, dim=-1) + prev = torch.multinomial(log_probs, num_samples=1) + prev_token = prev[0].item() + if prev_token >= vocab_size: + prev_token = 100 + prev[0] = 100 + if prev_token == 102 and len(all_generate_tokens) > int( + max(1, out_length) * 0.8): + break + if prev_token == 102: + counter += 1 + continue + dec_input_ids = torch.cat([dec_input_ids, prev], dim=1) + generate_tokens.append(prev_token) + all_generate_tokens.append(prev_token) + counter += 1 + + generate_context = [] + for token in all_generate_tokens: + if generate_context and generate_context[ + -1] == 100 and token == 100: + continue + else: + generate_context.append(token) + return {'generate_context': generate_context} diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/modeling_plug.py new file mode 100644 index 00000000..9d2bb14f --- /dev/null +++ b/modelscope/models/nlp/plug/modeling_plug.py @@ -0,0 +1,1054 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
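The generation loop above picks the next token by dividing the logits by a temperature, filtering them with `top_k_logits`, and drawing from the resulting distribution with `torch.multinomial`. A standalone illustration of the same top-k / top-p (nucleus) filtering, written under the loop's own assumption of a single sequence (logits of shape `[1, vocab]`):

```python
# Standalone illustration of the top-k / top-p (nucleus) filtering implemented
# by DistributedPlug.top_k_logits above; assumes a [1, vocab] logits tensor.
import torch
import torch.nn.functional as F


def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    logits = logits.clone()
    if top_k > 0:
        # keep only the k highest-scoring tokens
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth] = filter_value
    if top_p > 0.0:
        flat = logits.view(-1)
        sorted_logits, sorted_idx = torch.sort(flat, descending=True)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # shift right: always keep the top token
        remove[..., 0] = False
        flat[sorted_idx[remove]] = filter_value
        logits = flat.view(1, -1)
    return logits


logits = torch.randn(1, 21128)
probs = F.softmax(filter_logits(logits, top_k=20, top_p=0.9) / 0.9, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
```

As in the method above, the first token after sorting is always kept so that at least one candidate survives the nucleus cut.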
+ +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import math +import os + +import torch +import torch.nn.functional as F +from deepspeed.utils.timer import SynchronizedWallClockTimer +from megatron import mpu +from torch import nn + +from modelscope.utils.nlp.distributed import (normal_init_method, + scaled_init_method) +from .configuration_plug import PlugNLGConfig, PlugNLUConfig + +logger = logging.getLogger(__name__) + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish} + + +class BertLayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = mpu.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + init_method=normal_init_method( + mean=0.0, std=config.initializer_range)) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float( + ) 
+ token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( + mean=0.0, std=config.initializer_range) + self.dense = mpu.RowParallelLinear( + input_size=config.hidden_size, + output_size=config.hidden_size, + bias=True, + input_is_parallel=True, + stride=1, + init_method=init_method, + pruning_method=config.pruning_method if config.pruning_module in [ + 'all', 'encoder', 'encoder_self', 'encoder_selfvo', + 'encoder_selfo' + ] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.fp32_layernorm = config.fp32_layernorm + if not config.pre_ln: + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states, + input_tensor, + pruning_threshold=None, + ): + hidden_states = self.dense( + hidden_states, + pruning_threshold=pruning_threshold, + ) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + if self.LayerNorm is not None: + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + else: + hidden_states = ln_input + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super(BertAttention, self).__init__() + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.self = mpu.BertParallelSelfAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + dropout_prob=config.attention_probs_dropout_prob, + output_parallel=True, + init_method=normal_init_method( + mean=0.0, std=config.initializer_range), + separate=config.attn_separate, + pruning_method=config.pruning_method, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + pruning_module=config.pruning_module, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.output = BertSelfOutput(config) + + def forward( + self, + input_tensor, + attention_mask, + pruning_threshold=None, + ): + if self.LayerNorm is not None: + ln_input = input_tensor + previous_type = input_tensor.type() + if self.fp32_layernorm: + ln_input = input_tensor.float() + ln_output = self.LayerNorm(ln_input) + if self.fp32_layernorm: + ln_output = ln_output.type(previous_type) + self_output = self.self( + ln_output, + attention_mask, + pruning_threshold=pruning_threshold, + ) + else: + self_output = self.self( + 
input_tensor, + attention_mask, + pruning_threshold=pruning_threshold, + ) + output_pruning_threshold = pruning_threshold + + attention_output = self.output( + self_output, + input_tensor, + pruning_threshold=output_pruning_threshold, + ) + return attention_output + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = mpu.ColumnParallelLinear( + input_size=config.hidden_size, + output_size=config.intermediate_size, + bias=True, + gather_output=False, + stride=1, + init_method=normal_init_method( + mean=0.0, std=config.initializer_range), + pruning_method=config.pruning_method if config.pruning_module + in ['all', 'encoder', 'encoder_ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward( + self, + hidden_states, + pruning_threshold=None, + ): + hidden_states = self.dense( + hidden_states, + pruning_threshold=pruning_threshold, + ) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( + mean=0.0, std=config.initializer_range) + self.dense = mpu.RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=True, + input_is_parallel=True, + stride=1, + init_method=init_method, + pruning_method=config.pruning_method if config.pruning_module + in ['all', 'encoder', 'encoder_ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.fp32_layernorm = config.fp32_layernorm + if not config.pre_ln: + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states, + input_tensor, + pruning_threshold=None, + ): + hidden_states = self.dense( + hidden_states, + pruning_threshold=pruning_threshold, + ) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + if self.LayerNorm is not None: + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + else: + hidden_states = ln_input + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + + def forward( + self, + hidden_states, + attention_mask, + pruning_threshold=None, + ): + attention_output = self.attention( + hidden_states, attention_mask, pruning_threshold=pruning_threshold) + if self.LayerNorm is not 
None: + ln_input = attention_output + previous_type = attention_output.type() + if self.fp32_layernorm: + ln_input = attention_output.float() + ln_output = self.LayerNorm(ln_input) + if self.fp32_layernorm: + ln_output = ln_output.type(previous_type) + intermediate_output = self.intermediate( + ln_output, pruning_threshold=pruning_threshold) + else: + intermediate_output = self.intermediate( + attention_output, pruning_threshold=pruning_threshold) + layer_output = self.output( + intermediate_output, + attention_output, + pruning_threshold=pruning_threshold) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super(BertEncoder, self).__init__() + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + + def forward( + self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False, + detach_index=-1, + pruning_threshold=None, + ): + all_encoder_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer( + x_, inputs[1], pruning_threshold=pruning_threshold) + return x_ + + return custom_forward + + if checkpoint_activations: + layer_idx = 0 + num_layers = len(self.layer) + chunk_length = 1 + while layer_idx < num_layers: + hidden_states = mpu.checkpoint( + custom(layer_idx, layer_idx + chunk_length), hidden_states, + attention_mask * 1) + if detach_index == layer_idx: + hidden_states.detach_() + layer_idx += chunk_length + # decoder layers + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + if detach_index == i: + hidden_states.detach_() + if i == len(self.layer) - 1 and self.LayerNorm is not None: + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + if self.LayerNorm is not None: + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder_weight = bert_model_embedding_weights + self.bias = nn.Parameter( + torch.zeros(bert_model_embedding_weights.size(0))) + self.bias.model_parallel = True + self.fp32_embedding = config.fp32_embedding + self.fp32_layernorm = config.fp32_layernorm + + def convert_to_type(tensor): + if self.fp32_embedding: + return tensor.half() + else: + return tensor + + self.type_converter = convert_to_type + self.converted = False + self.timers = SynchronizedWallClockTimer() + + def forward(self, hidden_states): + if not self.converted: + self.converted = True + if self.fp32_embedding: + self.transform.half() + if self.fp32_layernorm: + self.transform.LayerNorm.float() + hidden_states = self.transform(self.type_converter(hidden_states)) + self.timers('final linear gather').start() + hidden_states = mpu.copy_to_model_parallel_region(hidden_states) + self.timers('final linear gather').stop() + hidden_states = F.linear( + self.type_converter(hidden_states), + self.type_converter(self.decoder_weight), + self.type_converter(self.bias)) + return hidden_states + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 3) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + for p in self.seq_relationship.parameters(): + if p is None: + continue + pooled_output = pooled_output.type_as(p) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, PlugNLUConfig) and not isinstance( + config, PlugNLGConfig): + raise ValueError( + 'Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
' + 'To create a model from a Google pretrained model use ' + '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format( + self.__class__.__name__, self.__class__.__name__)) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(PreTrainedBertModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as + described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLF`) to train on the Next-Sentence task (see BERT's paper). 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + output_all_encoded_layers=True, + checkpoint_activations=False, + detach_index=-1, + pruning_threshold=None, + ): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to( + dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder( + embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations, + detach_index=detach_index, + pruning_threshold=pruning_threshold) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + + pooled_output = sequence_output[:, 0] + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class DecodeLayer(nn.Module): + + def __init__(self, config): + super(DecodeLayer, self).__init__() + init_method = normal_init_method( + mean=0.0, std=config.initializer_range) + output_layer_init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + + self_pruning_method = config.pruning_method + cross_pruning_method = config.pruning_method + ffn_pruning_method = config.pruning_method + + if config.ft_module is not None: + if 'decoder_self' in config.ft_module: + self_pruning_method = 'finetune' + if 'decoder_cross' in config.ft_module: + cross_pruning_method = 'finetune' + if 'decoder_ffn' in config.ft_module: + ffn_pruning_method = 'finetune' + + self.attention = mpu.GPT2ParallelSelfAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + attention_dropout_prob=config.attention_probs_dropout_prob, + output_dropout_prob=config.hidden_dropout_prob, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + pruning_method=self_pruning_method if config.pruning_module in [ + 'all', 'decoder', 'decoder_self', 'decoder_self+ffn' + ] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank, + ) + + self.cross_attention = mpu.PalmParallelCrossAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + attention_dropout_prob=config.attention_probs_dropout_prob, + output_dropout_prob=config.hidden_dropout_prob, + init_method=init_method, + attn_separate=False, + output_layer_init_method=output_layer_init_method, + pruning_method=cross_pruning_method, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + pruning_module=config.pruning_module, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank, + ) + + self.input_layernorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.post_attention_layernorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.post_cross_attention_layernorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + + self.intermediate = mpu.ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + gather_output=False, + init_method=init_method, + pruning_method=ffn_pruning_method if config.pruning_module + in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + 
LR_mask_rank=config.LR_mask_rank, + ) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.output = mpu.RowParallelLinear( + config.intermediate_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + pruning_method=ffn_pruning_method if config.pruning_module + in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank, + ) + + self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.fp32_layernorm = config.fp32_layernorm + + def convert_to_type(tensor): + if self.fp32_layernorm: + return tensor.float() + else: + return tensor + + self.type_converter = convert_to_type + + # def forward(self, hidden_states, enc_attn_mask, dec_attn_mask): + def forward(self, + hidden_states, + enc_hidden_states, + enc_attn_mask, + dec_attn_mask, + is_infer=False, + pruning_threshold=None): + residual = hidden_states + previous_type = hidden_states.type() + hidden_states = self.input_layernorm( + self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.attention( + hidden_states, + dec_attn_mask, + is_infer=is_infer, + pruning_threshold=pruning_threshold) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm( + self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.cross_attention( + hidden_states, + enc_hidden_states, + enc_attn_mask, + pruning_threshold=pruning_threshold) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.post_cross_attention_layernorm( + self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.intermediate( + hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.intermediate_act_fn(hidden_states) + + hidden_states = self.output( + hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class BertDecoder(nn.Module): + + def __init__(self, config): + super(BertDecoder, self).__init__() + self.layer = nn.ModuleList( + [DecodeLayer(config) for _ in range(config.dec_hidden_layers)]) + + self.final_layernorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, + hidden_states, + enc_hidden_states, + enc_attn_mask, + dec_attn_mask, + checkpoint_activations=False, + output_all_encoded_layers=False, + is_infer=False, + pruning_threshold=None): + + def custom(start, end): + + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer( + x_, + inputs[1], + inputs[2], + dec_attn_mask * 1, + is_infer=is_infer, + pruning_threshold=pruning_threshold) + return x_ + + return custom_forward + + pre_enc_hidden = enc_hidden_states.data + if checkpoint_activations: + layer_idx = 0 + num_layers = len(self.layer) + chunk_length = 1 + while layer_idx < num_layers: + hidden_states = mpu.checkpoint( + custom(layer_idx, layer_idx + chunk_length), hidden_states, + enc_hidden_states, enc_attn_mask * 1) + 
enc_hidden_states.data = pre_enc_hidden + layer_idx += chunk_length + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module( + hidden_states, + enc_hidden_states, + enc_attn_mask, + dec_attn_mask, + is_infer=is_infer, + pruning_threshold=pruning_threshold) + + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.final_layernorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + + return [hidden_states] + + +class DecodeModel(PreTrainedBertModel): + + def __init__(self, config): + super(DecodeModel, self).__init__(config) + self.decoder = BertDecoder(config) + self.apply(self.init_bert_weights) + + def forward(self, + embeddings, + sequence_output, + decode_input_ids, + position_ids=None, + enc_attn_mask=None, + dec_attn_mask=None, + checkpoint_activations=False, + is_infer=False, + pruning_threshold=None): + extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.decoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = embeddings(decode_input_ids) + sequence_output = self.decoder( + embedding_output, + sequence_output, + extended_attention_mask, + dec_attn_mask, + checkpoint_activations=False, + is_infer=is_infer, + pruning_threshold=pruning_threshold) + return sequence_output[-1] + + +class PalmForPreTraining(PreTrainedBertModel): + + def __init__(self, config): + super(PalmForPreTraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads( + config, self.bert.embeddings.word_embeddings.weight) + self.decoder = DecodeModel(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + decode_input_ids=None, + position_ids=None, + decode_attention_mask=None, + lm_labels=None, + checkpoint_activations=False, + is_infer=False, + sequence_output=None, + parallel_output=True, + pruning_threshold=None): + if sequence_output is None: + sequence_output, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations, + pruning_threshold=pruning_threshold) + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + else: + prediction_scores = None + sequence_output = sequence_output.to( + dtype=next(self.decoder.parameters()).dtype) + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + decode_output = self.decoder( + self.bert.embeddings, + sequence_output, + decode_input_ids, + position_ids, + attention_mask, + decode_attention_mask, + checkpoint_activations=checkpoint_activations, + is_infer=is_infer, + pruning_threshold=pruning_threshold) + + transformer_output_parallel = mpu.copy_to_model_parallel_region( + decode_output) + + logits_parallel = F.linear(transformer_output_parallel, + self.bert.embeddings.word_embeddings.weight) + + if parallel_output: + return prediction_scores, logits_parallel + if is_infer: + return prediction_scores, mpu.gather_from_model_parallel_region( + logits_parallel), sequence_output + return prediction_scores, mpu.gather_from_model_parallel_region( + logits_parallel) + + +class PlugModel(torch.nn.Module): + + def __init__(self, config): + super(PlugModel, self).__init__() + self.config = config + 
self.model = PalmForPreTraining(self.config) + + def forward(self, + input_tokens, + token_type_ids=None, + attention_mask=None, + target_tokens=None, + position_ids=None, + decode_attention_mask=None, + checkpoint_activations=False, + is_infer=False, + sequence_output=None, + parallel_output=True): + return self.model( + input_tokens, + token_type_ids, + attention_mask, + target_tokens, + position_ids, + decode_attention_mask, + checkpoint_activations=checkpoint_activations, + is_infer=is_infer, + sequence_output=sequence_output, + parallel_output=parallel_output) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.model.state_dict( + destination=destination, prefix=prefix, keep_vars=keep_vars) + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) diff --git a/modelscope/models/nlp/ponet/__init__.py b/modelscope/models/nlp/ponet/__init__.py new file mode 100644 index 00000000..6d26b194 --- /dev/null +++ b/modelscope/models/nlp/ponet/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_ponet import PoNetConfig + from .modeling_ponet import (PoNetForMaskedLM, PoNetModel, + PoNetPreTrainedModel) + from .tokenization_ponet import PoNetTokenizer +else: + _import_structure = { + 'configuration_ponet': ['PoNetConfig'], + 'modeling_ponet': + ['PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'], + 'tokenization_ponet': ['PoNetTokenizer'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/ponet/configuration_ponet.py b/modelscope/models/nlp/ponet/configuration_ponet.py new file mode 100644 index 00000000..70294fc2 --- /dev/null +++ b/modelscope/models/nlp/ponet/configuration_ponet.py @@ -0,0 +1,117 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
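
The new modelscope/models/nlp/ponet/__init__.py above exposes the PoNet classes through LazyImportModule. As a hedged usage sketch of those exports (the tiny hyper-parameter values and the token ids below are made up for a quick smoke test, weights are randomly initialized, and no checkpoint is downloaded):

import torch
from modelscope.models.nlp.ponet import PoNetConfig, PoNetModel  # resolved lazily via LazyImportModule

# Hypothetical small config for a smoke test; real checkpoints rely on the defaults
# defined later in this diff (hidden_size=768, 12 layers, ...).
config = PoNetConfig(
    hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)
model = PoNetModel(config)
model.eval()

# 101/102 are the CLS/EOS ids hard-coded in modeling_ponet.py, so the default
# segment index can be derived directly from input_ids.
input_ids = torch.tensor([[101, 7592, 2088, 102]])
attention_mask = torch.ones_like(input_ids)
with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=attention_mask)
print(out.last_hidden_state.shape)  # torch.Size([1, 4, 64])
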
+""" PoNet model configuration, mainly copied from :class:`~transformers.BertConfig` """ +from transformers import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class PoNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration + of a :class:`~modelscope.models.nlp.ponet.PoNetModel`. + It is used to instantiate a PoNet model according to the specified arguments. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`): + Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`, + :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on + :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.) + `__. For more information on :obj:`"relative_key_query"`, please refer to + `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.) + `__. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
Only + relevant if ``config.is_decoder=True``. + classifier_dropout (:obj:`float`, `optional`): + The dropout ratio for the classification head. + clsgsepg (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not use a trick to make sure the segment and local information will not leak. + """ + model_type = 'ponet' + + def __init__(self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type='absolute', + use_cache=True, + classifier_dropout=None, + clsgsepg=True, + **kwargs): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + self.clsgsepg = clsgsepg diff --git a/modelscope/models/nlp/ponet/modeling_ponet.py b/modelscope/models/nlp/ponet/modeling_ponet.py new file mode 100644 index 00000000..f37954db --- /dev/null +++ b/modelscope/models/nlp/ponet/modeling_ponet.py @@ -0,0 +1,1591 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch PoNet model. 
""" + +import math +from dataclasses import dataclass +from distutils.version import LooseVersion +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + SequenceClassifierOutput, TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) +from transformers.models.bert.modeling_bert import \ + load_tf_weights_in_bert as load_tf_weights_in_ponet + +from modelscope.utils.logger import get_logger +from .configuration_ponet import PoNetConfig + +logger = get_logger(__name__) + +is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0') + +_CHECKPOINT_FOR_DOC = 'ponet-base-uncased' +_CONFIG_FOR_DOC = 'PoNetConfig' +_TOKENIZER_FOR_DOC = 'PoNetTokenizer' + +CLS_ID = 101 +EOS_ID = 102 + + +def segment_max(src, index, dim=1): + if is_pytorch_12plus: + out = torch.zeros_like(src).scatter_reduce( + dim, + index[:, :, None].expand_as(src), + src, + reduce='amax', + include_self=False) + else: + dummy_scatter_index = index[:, :, None].expand_as(src) + min_value = src.min() - 1 + dummpy_scatter_shape = (*src.shape[:-1], index.max() + 1, + src.shape[-1]) + dummy_scatter_index_expand = dummy_scatter_index.unsqueeze(-2).expand( + *dummpy_scatter_shape) + index_reconstruct_expand = torch.arange( + index.max() + 1, + device=src.device)[None, None, :, + None].expand(*dummpy_scatter_shape) + src_expand = src.unsqueeze(-2).expand(*dummpy_scatter_shape) + out, _ = src_expand.masked_scatter( + dummy_scatter_index_expand != index_reconstruct_expand, + torch.full_like(src_expand, min_value.item())).max(dim=1) + + dummy = index.unsqueeze(-1).expand(*index.shape[:2], out.size(-1)) + return torch.gather(out, dim, dummy).to(dtype=src.dtype) + + +def get_segment_index(input_ids, cls_id=CLS_ID, eos_id=EOS_ID): + mask = (input_ids == cls_id).to( + dtype=torch.long) + (input_ids == eos_id).to(dtype=torch.long) + mask = mask + torch.cat([torch.zeros_like(mask[:, 0:1]), mask[:, :-1]], + dim=1) + return mask.cumsum(dim=1) - 1 + + +def get_token_type_mask(input_ids, cls_id=CLS_ID, eos_id=EOS_ID): + mask = (input_ids == cls_id) | (input_ids == eos_id) + return mask + + +def get_win_max(hidden_states, kernel_size=3): + m = nn.MaxPool1d(kernel_size, stride=1, padding=kernel_size // 2) + out = m(hidden_states.permute(0, 2, 1)).permute(0, 2, 1) + return out + + +class PoNetEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint 
file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class PoNetSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + + self.dense_local = nn.Linear(config.hidden_size, config.hidden_size) + self.dense_segment = nn.Linear(config.hidden_size, config.hidden_size) + + self.num_attention_heads = config.num_attention_heads + self.clsgsepg = getattr(config, 'clsgsepg', True) + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.dense_q = nn.Linear(config.hidden_size, self.all_head_size) + self.dense_k = nn.Linear(config.hidden_size, self.all_head_size) + self.dense_o = nn.Linear(config.hidden_size, self.all_head_size) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) # bz, head, len, head_size + + def forward( + self, + hidden_states, + segment_index, + token_type_mask, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + context_layer_q = self.transpose_for_scores( + self.dense_q(hidden_states)) + context_layer_k = self.transpose_for_scores( + self.dense_k(hidden_states)) + context_layer_v = context_layer_k + context_layer_o = self.transpose_for_scores( + self.dense_o(hidden_states)) + + if attention_mask is not None: + _attention_mask = (attention_mask.squeeze(1).unsqueeze(-1) < -1) + + if attention_mask is not None: + context_layer_q.masked_fill_(_attention_mask, 0.0) + q = 
context_layer_q.sum(dim=-2) / torch.ones_like( + _attention_mask).to(dtype=context_layer_q.dtype).masked_fill( + _attention_mask, 0.0).sum(dim=-2) + else: + q = context_layer_q.mean(dim=-2) + att = torch.einsum('bdh,bdlh -> bdl', q, context_layer_k) / math.sqrt( + context_layer_q.shape[-1]) + if attention_mask is not None: + att = att + attention_mask.squeeze(1) + att_prob = att.softmax(dim=-1) + v = torch.einsum('bdlh,bdl->bdh', context_layer_v, att_prob) + + context_layer_segment = self.dense_segment(hidden_states) + context_layer_local = self.dense_local(hidden_states) + if attention_mask is not None: + context_layer_local.masked_fill_( + _attention_mask.squeeze(1), -10000) + context_layer_segment.masked_fill_( + _attention_mask.squeeze(1), -10000) + + if self.clsgsepg: + # XXX: a trick to make sure the segment and local information will not leak + context_layer_local = get_win_max( + context_layer_local.masked_fill( + token_type_mask.unsqueeze(dim=-1), -10000)) + context_layer_segment = segment_max( + context_layer_segment, index=segment_index) + + context_layer_segment.masked_fill_( + token_type_mask.unsqueeze(dim=-1), 0.0) + context_layer_local.masked_fill_( + token_type_mask.unsqueeze(dim=-1), 0.0) + else: + context_layer_local = get_win_max(context_layer_local) + context_layer_segment = segment_max( + context_layer_segment, index=segment_index) + + context_layer_local = self.transpose_for_scores(context_layer_local) + context_layer_segment = self.transpose_for_scores( + context_layer_segment) + + context_layer = (v.unsqueeze(dim=-2) + context_layer_segment + ) * context_layer_o + context_layer_local + context_layer = context_layer.permute(0, 2, 1, 3).reshape( + *hidden_states.shape[:2], -1) + + if attention_mask is not None: + context_layer.masked_fill_(_attention_mask.squeeze(1), 0.0) + + outputs = (context_layer, + att_prob) if output_attentions else (context_layer, ) + return outputs + + +class PoNetSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class PoNetIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class PoNetOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class PoNetAttention(nn.Module): + + def __init__(self, config): + super().__init__() + 
self.self = PoNetSelfAttention(config) + self.output = PoNetSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + segment_index, + token_type_mask, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + segment_index, + token_type_mask, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class PoNetLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = PoNetAttention(config) + + config.is_decoder = False # XXX: Decoder is not yet impletemented. + self.is_decoder = config.is_decoder + + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f'{self} should be used as a decoder model if cross attention is added' + self.crossattention = PoNetAttention(config) + self.intermediate = PoNetIntermediate(config) + self.output = PoNetOutput(config) + + def forward( + self, + hidden_states, + segment_index, + token_type_mask, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + segment_index, + token_type_mask, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, 'crossattention' + ), f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`' # noqa * + + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + 
attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class PoNetEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [PoNetLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + segment_index, + token_type_mask, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if getattr(self.config, 'gradient_checkpointing', + False) and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. 
Setting ' + '`use_cache=False`...') + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + segment_index, + token_type_mask, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + segment_index, + token_type_mask, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class PoNetPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class PoNetPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class PoNetLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = PoNetPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class PoNetOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = PoNetLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class PoNetPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = PoNetLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 3) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PoNetPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PoNetConfig + load_tf_weights = load_tf_weights_in_ponet + base_model_prefix = 'ponet' + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@dataclass +class PoNetForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.PoNetForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + mlm_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Masked language modeling loss. + sop_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + sop loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). 
+ hidden_states + (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed + or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed + or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + mlm_loss: Optional[torch.FloatTensor] = None + sop_loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +PONET_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +PONET_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? 
<../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + 'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.', + PONET_START_DOCSTRING, +) +class PoNetModel(PoNetPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = PoNetEmbeddings(config) + self.encoder = PoNetEncoder(config) + + self.pooler = PoNetPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + segment_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states + (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` + with each tuple having 4 tensors of shape :obj: + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + segment_index = get_segment_index( + input_ids) if segment_ids is None else segment_ids + token_type_mask = get_token_type_mask(input_ids) + encoder_outputs = self.encoder( + embedding_output, + segment_index, + token_type_mask, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( 
+ sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + PONET_START_DOCSTRING, +) +class PoNetForPreTraining(PoNetPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.ponet = PoNetModel(config) + self.cls = PoNetPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + segment_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example:: + + >>> from transformers import PoNetTokenizer, PoNetForPreTraining + >>> import torch + + >>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased') + >>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + masked_lm_loss = None + next_sentence_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 3), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, masked_lm_loss, next_sentence_loss) + + output) if total_loss is not None else output + + return PoNetForPreTrainingOutput( + loss=total_loss, + mlm_loss=masked_lm_loss, + sop_loss=next_sentence_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """PoNet Model with a `language modeling` head on top for CLM fine-tuning. 
""", + PONET_START_DOCSTRING) +class PoNetLMHeadModel(PoNetPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.ponet = PoNetModel(config, add_pooling_layer=False) + self.cls = PoNetOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + segment_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj: + `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` + with each tuple having 4 tensors of shape : + obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """PoNet Model with a `language modeling` head on top. 
""", + PONET_START_DOCSTRING) +class PoNetForMaskedLM(PoNetPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.ponet = PoNetModel(config, add_pooling_layer=False) + self.cls = PoNetOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + segment_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
+ """, + PONET_START_DOCSTRING, +) +class PoNetForSequenceClassification(PoNetPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.ponet = PoNetModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + segment_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + PONET_START_DOCSTRING, +) +class PoNetForTokenClassification(PoNetPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ponet = PoNetModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + segment_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/ponet/tokenization_ponet.py b/modelscope/models/nlp/ponet/tokenization_ponet.py new file mode 100644 index 00000000..21544886 --- /dev/null +++ b/modelscope/models/nlp/ponet/tokenization_ponet.py @@ -0,0 +1,155 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for PoNet """ + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from transformers.file_utils import PaddingStrategy +from transformers.models.bert.tokenization_bert import BertTokenizer + +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'nlp_ponet_fill-mask_chinese-base': 512, + 'nlp_ponet_fill-mask_english-base': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'nlp_ponet_fill-mask_chinese-base': { + 'do_lower_case': True + }, + 'nlp_ponet_fill-mask_english-base': { + 'do_lower_case': True + }, +} + + +class PoNetTokenizer(BertTokenizer): + r""" + Construct an PoNet tokenizer. Based on BertTokenizer. + + This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or + batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: (optional) Set to False to avoid returning + attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = 'attention_mask' in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and ( + max_length % pad_to_multiple_of != 0): + max_length = ( + (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( + required_input) != max_length + + if needs_to_be_padded: + difference = max_length - len(required_input) + if self.padding_side == 'right': + if return_attention_mask: + encoded_inputs['attention_mask'] = [1] * len( + required_input) + [0] * difference + if 'token_type_ids' in encoded_inputs: + encoded_inputs['token_type_ids'] = ( + encoded_inputs['token_type_ids'] + + [self.pad_token_type_id] * difference) + if 'special_tokens_mask' in encoded_inputs: + encoded_inputs['special_tokens_mask'] = encoded_inputs[ + 'special_tokens_mask'] + [1] * difference + if 'segment_ids' in encoded_inputs: + encoded_inputs[ + 'segment_ids'] = encoded_inputs['segment_ids'] + [ + encoded_inputs['segment_ids'][-1] + 1 + ] * difference # noqa * + encoded_inputs[self.model_input_names[ + 0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == 'left': + if return_attention_mask: + encoded_inputs['attention_mask'] = [0] * difference + [ + 1 + ] * len(required_input) + if 'token_type_ids' in encoded_inputs: + encoded_inputs['token_type_ids'] = [ + self.pad_token_type_id + ] * difference + encoded_inputs['token_type_ids'] + if 'segment_ids' in encoded_inputs: + encoded_inputs['segment_ids'] = [encoded_inputs['segment_ids'][-1] + 1] * difference + \ + encoded_inputs['segment_ids'] # noqa * + if 'special_tokens_mask' in encoded_inputs: + encoded_inputs['special_tokens_mask'] = [ + 1 + ] * difference + encoded_inputs['special_tokens_mask'] + encoded_inputs[self.model_input_names[ + 0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError('Invalid padding strategy:' + + str(self.padding_side)) + elif return_attention_mask and 'attention_mask' not in encoded_inputs: + encoded_inputs['attention_mask'] = [1] * len(required_input) + + return encoded_inputs diff --git a/modelscope/models/nlp/ponet_for_masked_language.py b/modelscope/models/nlp/ponet_for_masked_language.py new file mode 100644 index 00000000..11f4bc11 --- /dev/null +++ b/modelscope/models/nlp/ponet_for_masked_language.py @@ -0,0 +1,53 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.nlp.ponet import \ + PoNetForMaskedLM as PoNetForMaskedLMTransformer +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['PoNetForMaskedLM'] + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet) +class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer): + """PoNet for MLM model.'. + + Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets. 
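The only PoNet-specific piece of `_pad` above is how `segment_ids` are extended: pad positions receive a fresh segment id (`last segment id + 1`), presumably so they are never pooled together with a real segment. A standalone illustration follows (not part of the patch; the helper name is invented).

```python
# Mirrors the segment_ids handling in PoNetTokenizer._pad above.
def pad_segment_ids(segment_ids, difference, padding_side='right'):
    pad_value = segment_ids[-1] + 1  # pad tokens get their own, otherwise unused segment id
    if padding_side == 'right':
        return segment_ids + [pad_value] * difference
    return [pad_value] * difference + segment_ids


print(pad_segment_ids([0, 0, 1, 1, 2], difference=3))                 # [0, 0, 1, 1, 2, 3, 3, 3]
print(pad_segment_ids([0, 0, 1], difference=2, padding_side='left'))  # [2, 2, 0, 0, 1]
```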
+ """ + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + PoNetForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + segment_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = PoNetForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(PoNetForMaskedLMTransformer, + PoNetForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py new file mode 100644 index 00000000..955c0e53 --- /dev/null +++ b/modelscope/models/nlp/sentence_embedding.py @@ -0,0 +1,74 @@ +import os +from typing import Any, Dict + +import json +import numpy as np + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertPreTrainedModel +from modelscope.utils.constant import Tasks + +__all__ = ['SentenceEmbedding'] + + +@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) +class SentenceEmbedding(TorchModel, SbertPreTrainedModel): + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=False) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + """return the result by the model + + Args: + input (Dict[str, Any]): the preprocessed data + + Returns: + Dict[str, np.ndarray]: results + Example: + { + 'predictions': array([1]), # lable 0-negative 1-positive + 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), + 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value + } + """ + return self.base_model(**input) + + def postprocess(self, inputs: Dict[str, np.ndarray], + **kwargs) -> Dict[str, np.ndarray]: + embs = inputs['last_hidden_state'][:, 0].cpu().numpy() + num_sent = embs.shape[0] + if num_sent >= 2: + scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], + (1, 0))).tolist()[0] + else: + scores = [] + result = {'text_embedding': embs, 'scores': scores} + + return result + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. 
+ @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_args = {} + + return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py index 24641f06..bb1d18e4 100644 --- a/modelscope/models/nlp/space/model/__init__.py +++ b/modelscope/models/nlp/space/model/__init__.py @@ -1,6 +1,6 @@ from .configuration_space import SpaceConfig from .gen_unified_transformer import GenUnifiedTransformer -from .generator import Generator as SpaceGenerator +from .generator import SpaceGenerator from .intent_unified_transformer import IntentUnifiedTransformer from .model_base import SpaceModelBase from .modeling_space import (SpaceForDST, SpaceForMaskedLM, diff --git a/modelscope/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py index c1521e3d..0e7833e6 100644 --- a/modelscope/models/nlp/space/model/generator.py +++ b/modelscope/models/nlp/space/model/generator.py @@ -38,24 +38,24 @@ def gather(var, idx): return var -class Generator(object): +class SpaceGenerator(object): """ Genrator class. """ _registry = dict() @classmethod def register(cls, name): - Generator._registry[name] = cls + SpaceGenerator._registry[name] = cls return @staticmethod def by_name(name): - return Generator._registry[name] + return SpaceGenerator._registry[name] @staticmethod def create(config, *args, **kwargs): """ Create generator. """ - generator_cls = Generator.by_name(config.Generator.generator) + generator_cls = SpaceGenerator.by_name(config.Generator.generator) return generator_cls(config, *args, **kwargs) def __init__(self, config, reader): @@ -83,7 +83,7 @@ class Generator(object): raise NotImplementedError -class BeamSearch(Generator): +class BeamSearch(SpaceGenerator): """ BeamSearch generator. """ def __init__(self, config, reader): diff --git a/modelscope/models/nlp/space/space_for_dialog_modeling.py b/modelscope/models/nlp/space/space_for_dialog_modeling.py index 4c65c7d1..efa9b851 100644 --- a/modelscope/models/nlp/space/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/space_for_dialog_modeling.py @@ -41,7 +41,7 @@ class SpaceForDialogModeling(TorchModel): self.text_field = kwargs.pop( 'text_field', - MultiWOZBPETextField(self.model_dir, config=self.config)) + MultiWOZBPETextField(config=self.config, model_dir=self.model_dir)) self.generator = SpaceGenerator.create( self.config, reader=self.text_field) self.model = SpaceModelBase.create( diff --git a/modelscope/models/nlp/star3/__init__.py b/modelscope/models/nlp/star3/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/star3/configuration_star3.py b/modelscope/models/nlp/star3/configuration_star3.py new file mode 100644 index 00000000..4c5ae677 --- /dev/null +++ b/modelscope/models/nlp/star3/configuration_star3.py @@ -0,0 +1,115 @@ +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
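The `Generator` -> `SpaceGenerator` rename above has to touch every `_registry` access because subclasses such as `BeamSearch` register themselves on the base class by name. A simplified, self-contained mock of that pattern (class names here are invented; the real `create` resolves `config.Generator.generator`):

```python
class SpaceGeneratorMock:
    _registry = dict()

    @classmethod
    def register(cls, name):
        # Subclasses call register() so the base class can build them by name later.
        SpaceGeneratorMock._registry[name] = cls

    @staticmethod
    def by_name(name):
        return SpaceGeneratorMock._registry[name]

    @staticmethod
    def create(name, *args, **kwargs):
        return SpaceGeneratorMock.by_name(name)(*args, **kwargs)


class BeamSearchMock(SpaceGeneratorMock):
    pass


BeamSearchMock.register('BeamSearch')
print(type(SpaceGeneratorMock.create('BeamSearch')).__name__)  # BeamSearchMock
```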
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Star3 model configuration, adapted from the PyTorch BERT configuration."""
+
+from __future__ import absolute_import, division, print_function
+import copy
+import logging
+
+import json
+
+logger = logging.getLogger(__name__)
+
+
+class Star3Config(object):
+    """Configuration class to store the configuration of a `Star3Model`.
+    """
+
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02):
+        """Constructs Star3Config.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `Star3Model`,
+                or the path to a JSON config file.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            hidden_dropout_prob: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into `Star3Model`.
+            initializer_range: The stddev of the truncated_normal_initializer for
+                initializing all weight matrices. 
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open( + vocab_size_or_config_json_file, 'r', + encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + else: + raise ValueError( + 'First argument must be either a vocabulary size (int)' + 'or the path to a pretrained model config file (str)') + + @classmethod + def from_dict(cls, json_object): + """Constructs a `Star3Config` from a Python dictionary of parameters.""" + config = Star3Config(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `Star3Config` from a json file of parameters.""" + with open(json_file, 'r', encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' diff --git a/modelscope/models/nlp/star3/modeling_star3.py b/modelscope/models/nlp/star3/modeling_star3.py new file mode 100644 index 00000000..13f7136a --- /dev/null +++ b/modelscope/models/nlp/star3/modeling_star3.py @@ -0,0 +1,1001 @@ +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +from __future__ import absolute_import, division, print_function +import copy +import math +import os +import shutil +import tarfile +import tempfile + +import numpy as np +import torch +from torch import nn + +from modelscope.models.nlp.star3.configuration_star3 import Star3Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + +CONFIG_NAME = ModelFile.CONFIGURATION +WEIGHTS_NAME = ModelFile.TORCH_MODEL_BIN_FILE + + +def gelu(x): + """Implementation of the gelu activation function. 
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish} + + +class BertLayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + self.match_type_embeddings = nn.Embedding(11, config.hidden_size) + self.type_embeddings = nn.Embedding(6, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, + input_ids, + header_ids, + token_type_ids=None, + match_type_ids=None, + l_hs=None, + header_len=None, + type_idx=None, + col_dict_list=None, + ids=None, + header_flatten_tokens=None, + header_flatten_index=None, + header_flatten_output=None, + token_column_id=None, + token_column_mask=None, + column_start_index=None, + headers_length=None): + seq_length = input_ids.size(1) + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + words_embeddings = self.word_embeddings(input_ids) + header_embeddings = self.word_embeddings(header_ids) + + if col_dict_list is not None and l_hs is not None: + col_dict_list = np.array(col_dict_list)[ids.cpu().numpy()].tolist() + header_len = np.array( + header_len, dtype=object)[ids.cpu().numpy()].tolist() + for bi, col_dict in enumerate(col_dict_list): + for ki, vi in col_dict.items(): + length = header_len[bi][vi] + if length == 0: + continue + words_embeddings[bi, ki, :] = torch.mean( + header_embeddings[bi, vi, :length, :], dim=0) + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = words_embeddings + position_embeddings + token_type_embeddings + + if match_type_ids is not None: + match_type_embeddings = self.match_type_embeddings(match_type_ids) + embeddings += match_type_embeddings + + if type_idx is not None: + type_embeddings = self.type_embeddings(type_idx) + embeddings += type_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class 
BertSelfAttention(nn.Module): + + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, schema_link_matrix=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfAttentionWithRelationsRAT(nn.Module): + ''' + Adapted from https://github.com/microsoft/rat-sql/blob/master/ratsql/models/transformer.py + ''' + + def __init__(self, config): + super(BertSelfAttentionWithRelationsRAT, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.relation_k_emb = nn.Embedding( + 7, config.hidden_size // config.num_attention_heads) + self.relation_v_emb = nn.Embedding( + 7, config.hidden_size // config.num_attention_heads) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, relation): + ''' + relation is [batch, seq len, seq len] + ''' + mixed_query_layer = self.query( + hidden_states) # [batch, seq len, hidden dim] + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + relation_k = self.relation_k_emb( + relation) # [batch, seq len, seq len, head dim] + relation_v = self.relation_v_emb( + relation) # [batch, seq len, seq len, head dim] + + query_layer = self.transpose_for_scores( + mixed_query_layer) # [batch, num attn heads, seq len, head dim] + key_layer = self.transpose_for_scores( + mixed_key_layer) # [batch, num attn heads, seq len, head dim] + value_layer = self.transpose_for_scores( + mixed_value_layer) # [batch, num attn heads, seq len, head dim] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose( + -1, -2)) # [batch, num attn heads, seq len, seq len] + + # relation_k_t is [batch, seq len, head dim, seq len] + relation_k_t = relation_k.transpose(-2, -1) + # query_layer_t is [batch, seq len, num attn heads, head dim] + query_layer_t = query_layer.permute(0, 2, 1, 3) + # relation_attention_scores is [batch, seq len, num attn heads, seq len] + relation_attention_scores = torch.matmul(query_layer_t, relation_k_t) + # relation_attention_scores_t is [batch, num attn heads, seq len, seq len] + relation_attention_scores_t = relation_attention_scores.permute( + 0, 2, 1, 3) + + merged_attention_scores = (attention_scores + + relation_attention_scores_t) / math.sqrt( + self.attention_head_size) + + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + merged_attention_scores = merged_attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.Softmax(dim=-1)(merged_attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + # attention_probs is [batch, num attn heads, seq len, seq len] + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + # attention_probs_t is [batch, seq len, num attn heads, seq len] + attention_probs_t = attention_probs.permute(0, 2, 1, 3) + + # [batch, seq len, num attn heads, seq len] + # * [batch, seq len, seq len, head dim] + # = [batch, seq len, num attn heads, head dim] + context_relation = torch.matmul(attention_probs_t, relation_v) + + # context_relation_t is [batch, num attn heads, seq len, head dim] + context_relation_t = context_relation.permute(0, 2, 1, 3) + + merged_context_layer = context_layer + context_relation_t + merged_context_layer = merged_context_layer.permute(0, 2, 1, + 3).contiguous() + new_context_layer_shape = merged_context_layer.size()[:-2] + ( + self.all_head_size, ) + merged_context_layer = merged_context_layer.view( + *new_context_layer_shape) + return merged_context_layer + + +class BertSelfAttentionWithRelationsTableformer(nn.Module): + + def __init__(self, config): + super(BertSelfAttentionWithRelationsTableformer, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.schema_link_embeddings = nn.Embedding(7, self.num_attention_heads) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, relation): + ''' + relation is [batch, seq len, seq len] + ''' + mixed_query_layer = self.query( + hidden_states) # [batch, seq len, hidden dim] + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + schema_link_embeddings = self.schema_link_embeddings( + relation) # [batch, seq len, seq len, 1] + schema_link_embeddings = schema_link_embeddings.permute(0, 3, 1, 2) + + query_layer = self.transpose_for_scores( + mixed_query_layer) # [batch, num attn heads, seq len, head dim] + key_layer = self.transpose_for_scores( + mixed_key_layer) # [batch, num attn heads, seq len, head dim] + value_layer = self.transpose_for_scores( + mixed_value_layer) # [batch, num attn heads, seq len, head dim] + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose( + -1, -2)) # [batch, num attn heads, seq len, seq len] + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + + merged_attention_scores = attention_scores + schema_link_embeddings + + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + merged_attention_scores = merged_attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(merged_attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + # attention_probs is [batch, num attn heads, seq len, seq len] + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, schema_link_module='none'): + super(BertAttention, self).__init__() + if schema_link_module == 'none': + self.self = BertSelfAttention(config) + if schema_link_module == 'rat': + self.self = BertSelfAttentionWithRelationsRAT(config) + if schema_link_module == 'add': + self.self = BertSelfAttentionWithRelationsTableformer(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask, schema_link_matrix=None): + self_output = self.self(input_tensor, attention_mask, + schema_link_matrix) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config, schema_link_module='none'): + super(BertLayer, self).__init__() + self.attention = BertAttention( + config, schema_link_module=schema_link_module) + self.intermediate = 
BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, schema_link_matrix=None): + attention_output = self.attention(hidden_states, attention_mask, + schema_link_matrix) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SqlBertEncoder(nn.Module): + + def __init__(self, layers, config): + super(SqlBertEncoder, self).__init__() + layer = BertLayer(config) + self.layer = nn.ModuleList( + [copy.deepcopy(layer) for _ in range(layers)]) + + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True): + all_encoder_layers = [] + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertEncoder(nn.Module): + + def __init__(self, config, schema_link_module='none'): + super(BertEncoder, self).__init__() + layer = BertLayer(config, schema_link_module=schema_link_module) + self.layer = nn.ModuleList( + [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, + hidden_states, + attention_mask, + all_schema_link_matrix=None, + all_schema_link_mask=None, + output_all_encoded_layers=True): + all_encoder_layers = [] + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask, + all_schema_link_matrix) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
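+        # (i.e. weight tying: `decoder.weight` is assigned the word embedding matrix
+        # itself just below, so the vocabulary projection reuses the input embeddings
+        # and only the per-token bias is a newly created parameter)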
+ self.decoder = nn.Linear( + bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter( + torch.zeros(bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, Star3Config): + raise ValueError( + 'Parameter config in `{}(config)` should be an instance of class `Star3Config`. ' + 'To create a model from a Google pretrained model use ' + '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format( + self.__class__.__name__, self.__class__.__name__)) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, + pretrained_model_name, + state_dict=None, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . 
`pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) + to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + resolved_archive_file = pretrained_model_name + # redirect to the cache, if necessary + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info('extracting archive file {} to temp dir {}'.format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = Star3Config.from_json_file(config_file) + logger.info('Model config {}'.format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, + True, missing_keys, unexpected_keys, + error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + logger.info( + 'Weights of {} not initialized from pretrained model: {}'. + format(model.__class__.__name__, missing_keys)) + print() + print('*' * 10, 'WARNING missing weights', '*' * 10) + print('Weights of {} not initialized from pretrained model: {}'. + format(model.__class__.__name__, missing_keys)) + print() + if len(unexpected_keys) > 0: + logger.info( + 'Weights from pretrained model not used in {}: {}'.format( + model.__class__.__name__, unexpected_keys)) + print() + print('*' * 10, 'WARNING unexpected weights', '*' * 10) + print('Weights from pretrained model not used in {}: {}'.format( + model.__class__.__name__, unexpected_keys)) + print() + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class Star3Model(PreTrainedBertModel): + """Star3Model model ("Bidirectional Embedding Representations from a Transformer pretrained on STAR3.0"). 
+ + Params: + config: a Star3Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output + as described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLF`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.Star3Config(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.Star3Model(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, schema_link_module='none'): + super(Star3Model, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder( + config, schema_link_module=schema_link_module) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + header_ids, + token_order_ids=None, + token_type_ids=None, + attention_mask=None, + match_type_ids=None, + l_hs=None, + header_len=None, + type_ids=None, + col_dict_list=None, + ids=None, + header_flatten_tokens=None, + header_flatten_index=None, + header_flatten_output=None, + token_column_id=None, + token_column_mask=None, + column_start_index=None, + headers_length=None, + all_schema_link_matrix=None, + all_schema_link_mask=None, + output_all_encoded_layers=True): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. 
+ # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + # Bowen: comment out the following line for Pytorch >= 1.5 + # https://github.com/huggingface/transformers/issues/3936#issuecomment-793764416 + # extended_attention_mask = extended_attention_mask.to(self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings( + input_ids, header_ids, token_type_ids, match_type_ids, l_hs, + header_len, type_ids, col_dict_list, ids, header_flatten_tokens, + header_flatten_index, header_flatten_output, token_column_id, + token_column_mask, column_start_index, headers_length) + encoded_layers = self.encoder( + embedding_output, + extended_attention_mask, + all_schema_link_matrix=all_schema_link_matrix, + all_schema_link_mask=all_schema_link_mask, + output_all_encoded_layers=output_all_encoded_layers) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class Seq2SQL(nn.Module): + + def __init__(self, iS, hS, lS, dr, n_cond_ops, n_agg_ops, n_action_ops, + max_select_num, max_where_num, device): + super(Seq2SQL, self).__init__() + self.iS = iS + self.hS = hS + self.ls = lS + self.dr = dr + self.device = device + + self.n_agg_ops = n_agg_ops + self.n_cond_ops = n_cond_ops + self.n_action_ops = n_action_ops + self.max_select_num = max_select_num + self.max_where_num = max_where_num + + self.w_sss_model = nn.Linear(iS, max_where_num) + self.w_sse_model = nn.Linear(iS, max_where_num) + self.s_ht_model = nn.Linear(iS, max_select_num) + self.wc_ht_model = nn.Linear(iS, max_where_num) + + self.select_agg_model = nn.Linear(iS * max_select_num, + n_agg_ops * max_select_num) + self.w_op_model = nn.Linear(iS * max_where_num, + n_cond_ops * max_where_num) + + self.conn_model = nn.Linear(iS, 3) + self.action_model = nn.Linear(iS, n_action_ops + 1) + self.slen_model = nn.Linear(iS, max_select_num + 1) + self.wlen_model = nn.Linear(iS, max_where_num + 1) + + def forward(self, wemb_layer, l_n, l_hs, start_index, column_index, tokens, + ids): + # chunk input lists for multi-gpu + max_l_n = max(l_n) + max_l_hs = max(l_hs) + l_n = np.array(l_n)[ids.cpu().numpy()].tolist() + l_hs = np.array(l_hs)[ids.cpu().numpy()].tolist() + start_index = np.array(start_index)[ids.cpu().numpy()].tolist() + column_index = np.array(column_index)[ids.cpu().numpy()].tolist() + # tokens = np.array(tokens)[ids.cpu().numpy()].tolist() + + conn_index = [] + slen_index = [] + wlen_index = [] + action_index = [] + where_op_index = [] + select_agg_index = [] + header_pos_index = [] + query_index = [] + for ib, elem in enumerate(start_index): + # [SEP] conn [SEP] wlen [SEP] (wop [SEP])*wn slen [SEP] (agg [SEP])*sn + action_index.append(elem + 1) + conn_index.append(elem + 2) + 
wlen_index.append(elem + 3) + woi = [elem + 4 + i for i in range(self.max_where_num)] + + slen_index.append(elem + 4 + self.max_where_num) + sai = [ + elem + 5 + self.max_where_num + i + for i in range(self.max_select_num) + ] + where_op_index.append(woi) + select_agg_index.append(sai) + + qilist = [i for i in range(l_n[ib] + 2)] + [l_n[ib] + 1] * ( + max_l_n - l_n[ib]) + query_index.append(qilist) + + index = [column_index[ib] + i for i in range(0, l_hs[ib], 1)] + index += [index[0] for _ in range(max_l_hs - len(index))] + header_pos_index.append(index) + + # print("tokens: ", tokens) + # print("conn_index: ", conn_index, "start_index: ", start_index) + conn_index = torch.tensor(conn_index, dtype=torch.long).to(self.device) + slen_index = torch.tensor(slen_index, dtype=torch.long).to(self.device) + wlen_index = torch.tensor(wlen_index, dtype=torch.long).to(self.device) + action_index = torch.tensor( + action_index, dtype=torch.long).to(self.device) + where_op_index = torch.tensor( + where_op_index, dtype=torch.long).to(self.device) + select_agg_index = torch.tensor( + select_agg_index, dtype=torch.long).to(self.device) + query_index = torch.tensor( + query_index, dtype=torch.long).to(self.device) + header_index = torch.tensor( + header_pos_index, dtype=torch.long).to(self.device) + + bS = len(l_n) + conn_emb = torch.zeros([bS, self.iS]).to(self.device) + slen_emb = torch.zeros([bS, self.iS]).to(self.device) + wlen_emb = torch.zeros([bS, self.iS]).to(self.device) + action_emb = torch.zeros([bS, self.iS]).to(self.device) + wo_emb = torch.zeros([bS, self.max_where_num, self.iS]).to(self.device) + sa_emb = torch.zeros([bS, self.max_select_num, + self.iS]).to(self.device) + qv_emb = torch.zeros([bS, max_l_n + 2, self.iS]).to(self.device) + ht_emb = torch.zeros([bS, max_l_hs, self.iS]).to(self.device) + for i in range(bS): + conn_emb[i, :] = wemb_layer[i].index_select(0, conn_index[i]) + slen_emb[i, :] = wemb_layer[i].index_select(0, slen_index[i]) + wlen_emb[i, :] = wemb_layer[i].index_select(0, wlen_index[i]) + action_emb[i, :] = wemb_layer[i].index_select(0, action_index[i]) + + wo_emb[i, :, :] = wemb_layer[i].index_select( + 0, where_op_index[i, :]) + sa_emb[i, :, :] = wemb_layer[i].index_select( + 0, select_agg_index[i, :]) + qv_emb[i, :, :] = wemb_layer[i].index_select(0, query_index[i, :]) + ht_emb[i, :, :] = wemb_layer[i].index_select(0, header_index[i, :]) + + s_cco = self.conn_model(conn_emb.reshape(-1, self.iS)).reshape(bS, 3) + s_slen = self.slen_model(slen_emb.reshape(-1, self.iS)).reshape( + bS, self.max_select_num + 1) + s_wlen = self.wlen_model(wlen_emb.reshape(-1, self.iS)).reshape( + bS, self.max_where_num + 1) + s_action = self.action_model(action_emb.reshape(-1, self.iS)).reshape( + bS, self.n_action_ops + 1) + wo_output = self.w_op_model( + wo_emb.reshape(-1, self.iS * self.max_where_num)).reshape( + bS, -1, self.n_cond_ops) + + wc_output = self.wc_ht_model(ht_emb.reshape(-1, self.iS)).reshape( + bS, -1, self.max_where_num).transpose(1, 2) + + wv_ss = self.w_sss_model(qv_emb.reshape(-1, self.iS)).reshape( + bS, -1, self.max_where_num).transpose(1, 2) + wv_se = self.w_sse_model(qv_emb.reshape(-1, self.iS)).reshape( + bS, -1, self.max_where_num).transpose(1, 2) + + sc_output = self.s_ht_model(ht_emb.reshape(-1, self.iS)).reshape( + bS, -1, self.max_select_num).transpose(1, 2) + sa_output = self.select_agg_model( + sa_emb.reshape(-1, self.iS * self.max_select_num)).reshape( + bS, -1, self.n_agg_ops) + + return s_action, sc_output, sa_output, s_cco, wc_output, wo_output, ( + 
wv_ss, wv_se), (s_slen, s_wlen) diff --git a/modelscope/models/nlp/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration_sbert.py index 374d4b62..a727a978 100644 --- a/modelscope/models/nlp/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration_sbert.py @@ -85,7 +85,7 @@ class SbertConfig(PretrainedConfig): If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor """ - model_type = 'sbert' + model_type = 'structbert' def __init__(self, vocab_size=30522, diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py new file mode 100644 index 00000000..3c91a518 --- /dev/null +++ b/modelscope/models/nlp/table_question_answering.py @@ -0,0 +1,745 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Dict + +import numpy +import torch +import torch.nn.functional as F +from transformers import BertTokenizer + +from modelscope.metainfo import Models +from modelscope.models.base import Model, Tensor +from modelscope.models.builder import MODELS +from modelscope.models.nlp.star3.configuration_star3 import Star3Config +from modelscope.models.nlp.star3.modeling_star3 import Seq2SQL, Star3Model +from modelscope.preprocessors.star3.fields.struct import Constant +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import verify_device + +__all__ = ['TableQuestionAnswering'] + + +@MODELS.register_module( + Tasks.table_question_answering, module_name=Models.star3) +class TableQuestionAnswering(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the table-question-answering model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
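The constructor body that follows loads a single PyTorch checkpoint holding two sub-state-dicts, one for the Star3 backbone and one for the Seq2SQL head. The sketch below shows a compatible layout; it is inferred from the loading code, not an authoritative description of the released checkpoint, and the function and variable names are hypothetical:

```python
import os

import torch
from modelscope.utils.constant import ModelFile


def save_star3_checkpoint(backbone, head, model_dir):
    """Sketch: write a checkpoint in the layout the constructor below expects."""
    torch.save(
        {
            'backbone_model': backbone.state_dict(),  # read back via Star3Model.load_state_dict
            'head_model': head.state_dict(),          # read back via Seq2SQL.load_state_dict(strict=False)
        },
        os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
```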
+ """ + super().__init__(model_dir, *args, **kwargs) + self.tokenizer = BertTokenizer( + os.path.join(model_dir, ModelFile.VOCAB_FILE)) + device_name = kwargs.get('device', 'gpu') + verify_device(device_name) + self._device_name = device_name + + state_dict = torch.load( + os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE), + map_location='cpu') + + self.backbone_config = Star3Config.from_json_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + self.backbone_model = Star3Model( + config=self.backbone_config, schema_link_module='rat') + self.backbone_model.load_state_dict(state_dict['backbone_model']) + + constant = Constant() + self.agg_ops = constant.agg_ops + self.cond_ops = constant.cond_ops + self.cond_conn_ops = constant.cond_conn_ops + self.action_ops = constant.action_ops + self.max_select_num = constant.max_select_num + self.max_where_num = constant.max_where_num + self.col_type_dict = constant.col_type_dict + self.schema_link_dict = constant.schema_link_dict + n_cond_ops = len(self.cond_ops) + n_agg_ops = len(self.agg_ops) + n_action_ops = len(self.action_ops) + iS = self.backbone_config.hidden_size + self.head_model = Seq2SQL(iS, 100, 2, 0.0, n_cond_ops, n_agg_ops, + n_action_ops, self.max_select_num, + self.max_where_num, self._device_name) + self.head_model.load_state_dict(state_dict['head_model'], strict=False) + + self.backbone_model.to(self._device_name) + self.head_model.to(self._device_name) + + def convert_string(self, pr_wvi, nlu, nlu_tt): + convs = [] + for b, nlu1 in enumerate(nlu): + conv_dict = {} + nlu_tt1 = nlu_tt[b] + idx = 0 + convflag = True + for i, ntok in enumerate(nlu_tt1): + if idx >= len(nlu1): + convflag = False + break + + if ntok.startswith('##'): + ntok = ntok.replace('##', '') + + tok = nlu1[idx:idx + 1].lower() + if ntok == tok: + conv_dict[i] = [idx, idx + 1] + idx += 1 + elif ntok == '#': + conv_dict[i] = [idx, idx] + elif ntok == '[UNK]': + conv_dict[i] = [idx, idx + 1] + j = i + 1 + idx += 1 + if idx < len(nlu1) and j < len( + nlu_tt1) and nlu_tt1[j] != '[UNK]': + while idx < len(nlu1): + val = nlu1[idx:idx + 1].lower() + if nlu_tt1[j].startswith(val): + break + idx += 1 + conv_dict[i][1] = idx + elif tok in ntok: + startid = idx + idx += 1 + while idx < len(nlu1): + tok += nlu1[idx:idx + 1].lower() + if ntok == tok: + conv_dict[i] = [startid, idx + 1] + break + idx += 1 + idx += 1 + else: + convflag = False + + conv = [] + if convflag: + for pr_wvi1 in pr_wvi[b]: + s1, e1 = conv_dict[pr_wvi1[0]] + s2, e2 = conv_dict[pr_wvi1[1]] + newidx = pr_wvi1[1] + while newidx + 1 < len( + nlu_tt1) and s2 == e2 and nlu_tt1[newidx] == '#': + newidx += 1 + s2, e2 = conv_dict[newidx] + if newidx + 1 < len(nlu_tt1) and nlu_tt1[ + newidx + 1].startswith('##'): + s2, e2 = conv_dict[newidx + 1] + phrase = nlu1[s1:e2] + conv.append(phrase) + else: + for pr_wvi1 in pr_wvi[b]: + phrase = ''.join(nlu_tt1[pr_wvi1[0]:pr_wvi1[1] + + 1]).replace('##', '') + conv.append(phrase) + convs.append(conv) + + return convs + + def get_fields_info(self, t1s, tables, train=True): + nlu, nlu_t, sql_i, q_know, t_know, action, hs_t, types, units, his_sql, schema_link = \ + [], [], [], [], [], [], [], [], [], [], [] + for t1 in t1s: + nlu.append(t1['question']) + nlu_t.append(t1['question_tok']) + hs_t.append(t1['header_tok']) + q_know.append(t1['bertindex_knowledge']) + t_know.append(t1['header_knowledge']) + types.append(t1['types']) + units.append(t1['units']) + his_sql.append(t1.get('history_sql', None)) + schema_link.append(t1.get('schema_link', [])) + if 
train: + action.append(t1.get('action', [0])) + sql_i.append(t1['sql']) + + return nlu, nlu_t, sql_i, q_know, t_know, action, hs_t, types, units, his_sql, schema_link + + def get_history_select_where(self, his_sql, header_len): + if his_sql is None: + return [0], [0] + + sel = [] + for seli in his_sql['sel']: + if seli + 1 < header_len and seli + 1 not in sel: + sel.append(seli + 1) + + whe = [] + for condi in his_sql['conds']: + if condi[0] + 1 < header_len and condi[0] + 1 not in whe: + whe.append(condi[0] + 1) + + if len(sel) == 0: + sel.append(0) + if len(whe) == 0: + whe.append(0) + + sel.sort() + whe.sort() + + return sel, whe + + def get_types_ids(self, col_type): + for key, type_ids in self.col_type_dict.items(): + if key in col_type.lower(): + return type_ids + return self.col_type_dict['null'] + + def generate_inputs(self, nlu1_tok, hs_t_1, type_t, unit_t, his_sql, + q_know, t_know, s_link): + tokens = [] + orders = [] + types = [] + segment_ids = [] + matchs = [] + col_dict = {} + schema_tok = [] + + tokens.append('[CLS]') + orders.append(0) + types.append(0) + i_st_nlu = len(tokens) + + matchs.append(0) + segment_ids.append(0) + for idx, token in enumerate(nlu1_tok): + if q_know[idx] == 100: + break + elif q_know[idx] >= 5: + matchs.append(1) + else: + matchs.append(q_know[idx] + 1) + tokens.append(token) + orders.append(0) + types.append(0) + segment_ids.append(0) + + i_ed_nlu = len(tokens) + + tokens.append('[SEP]') + orders.append(0) + types.append(0) + matchs.append(0) + segment_ids.append(0) + + sel, whe = self.get_history_select_where(his_sql, len(hs_t_1)) + + if len(sel) == 1 and sel[0] == 0 \ + and len(whe) == 1 and whe[0] == 0: + pass + else: + tokens.append('select') + orders.append(0) + types.append(0) + matchs.append(10) + segment_ids.append(0) + + for seli in sel: + tokens.append('[PAD]') + orders.append(0) + types.append(0) + matchs.append(10) + segment_ids.append(0) + col_dict[len(tokens) - 1] = seli + + tokens.append('where') + orders.append(0) + types.append(0) + matchs.append(10) + segment_ids.append(0) + + for whei in whe: + tokens.append('[PAD]') + orders.append(0) + types.append(0) + matchs.append(10) + segment_ids.append(0) + col_dict[len(tokens) - 1] = whei + + tokens.append('[SEP]') + orders.append(0) + types.append(0) + matchs.append(10) + segment_ids.append(0) + + column_start = len(tokens) + i_hds_f = [] + header_flatten_tokens, header_flatten_index = [], [] + for i, hds11 in enumerate(hs_t_1): + if len(unit_t[i]) == 1 and unit_t[i][0] == 'null': + temp_header_tokens = hds11 + else: + temp_header_tokens = hds11 + unit_t[i] + schema_tok.append(temp_header_tokens) + header_flatten_tokens.extend(temp_header_tokens) + header_flatten_index.extend([i + 1] * len(temp_header_tokens)) + i_st_hd_f = len(tokens) + tokens += ['[PAD]'] + orders.append(0) + types.append(self.get_types_ids(type_t[i])) + i_ed_hd_f = len(tokens) + col_dict[len(tokens) - 1] = i + i_hds_f.append((i_st_hd_f, i_ed_hd_f)) + if i == 0: + matchs.append(6) + else: + matchs.append(t_know[i - 1] + 6) + segment_ids.append(1) + + tokens.append('[SEP]') + orders.append(0) + types.append(0) + matchs.append(0) + segment_ids.append(1) + + # position where + # [SEP] + start_ids = len(tokens) - 1 + + tokens.append('action') # action + orders.append(1) + types.append(0) + matchs.append(0) + segment_ids.append(1) + + tokens.append('connect') # column + orders.append(1) + types.append(0) + matchs.append(0) + segment_ids.append(1) + + tokens.append('allen') # select len + orders.append(1) + types.append(0) 
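A quick worked example of the history handling above: `get_history_select_where` shifts the previous turn's column indices by one, drops anything that falls outside the current header range, and falls back to `[0]` when nothing survives. The concrete numbers below are made up:

```python
# Worked example (values are illustrative).
his_sql = {'sel': [0, 2], 'conds': [[1, 0, '北京'], [5, 2, 'x']]}
header_len = 4

# sel:   0 -> 1, 2 -> 3            (both satisfy seli + 1 < header_len)
# conds: column 1 -> 2; column 5 is dropped (5 + 1 >= header_len)
# => sel == [1, 3], whe == [2]
# With his_sql=None, or when nothing passes the filter, both lists fall back to [0].
```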
+ matchs.append(0) + segment_ids.append(1) + + for x in range(self.max_where_num): + tokens.append('act') # op + orders.append(2 + x) + types.append(0) + matchs.append(0) + segment_ids.append(1) + + tokens.append('size') # where len + orders.append(1) + types.append(0) + matchs.append(0) + segment_ids.append(1) + + for x in range(self.max_select_num): + tokens.append('focus') # agg + orders.append(2 + x) + types.append(0) + matchs.append(0) + segment_ids.append(1) + + i_nlu = (i_st_nlu, i_ed_nlu) + + schema_link_matrix = numpy.zeros((len(tokens), len(tokens)), + dtype='int32') + schema_link_mask = numpy.zeros((len(tokens), len(tokens)), + dtype='float32') + for relation in s_link: + if relation['label'] in ['col', 'val']: + [q_st, q_ed] = relation['question_index'] + cid = max(0, relation['column_index']) + schema_link_matrix[ + i_st_nlu + q_st: i_st_nlu + q_ed + 1, + column_start + cid + 1: column_start + cid + 1 + 1] = \ + self.schema_link_dict[relation['label'] + '_middle'] + schema_link_matrix[ + i_st_nlu + q_st, + column_start + cid + 1: column_start + cid + 1 + 1] = \ + self.schema_link_dict[relation['label'] + '_start'] + schema_link_matrix[ + i_st_nlu + q_ed, + column_start + cid + 1: column_start + cid + 1 + 1] = \ + self.schema_link_dict[relation['label'] + '_end'] + schema_link_mask[i_st_nlu + q_st:i_st_nlu + q_ed + 1, + column_start + cid + 1:column_start + cid + 1 + + 1] = 1.0 + + return tokens, orders, types, segment_ids, matchs, \ + i_nlu, i_hds_f, start_ids, column_start, col_dict, schema_tok, \ + header_flatten_tokens, header_flatten_index, schema_link_matrix, schema_link_mask + + def gen_l_hpu(self, i_hds): + """ + Treat columns as if it is a batch of natural language utterance + with batch-size = # of columns * # of batch_size + i_hds = [(17, 18), (19, 21), (22, 23), (24, 25), (26, 29), (30, 34)]) + """ + l_hpu = [] + for i_hds1 in i_hds: + for i_hds11 in i_hds1: + l_hpu.append(i_hds11[1] - i_hds11[0]) + + return l_hpu + + def get_bert_output(self, model_bert, tokenizer, nlu_t, hs_t, col_types, + units, his_sql, q_know, t_know, schema_link): + """ + Here, input is toknized further by WordPiece (WP) tokenizer and fed into BERT. + + INPUT + :param model_bert: + :param tokenizer: WordPiece toknizer + :param nlu: Question + :param nlu_t: CoreNLP tokenized nlu. + :param hds: Headers + :param hs_t: None or 1st-level tokenized headers + :param max_seq_length: max input token length + + OUTPUT + tokens: BERT input tokens + nlu_tt: WP-tokenized input natural language questions + orig_to_tok_index: map the index of 1st-level-token to the index of 2nd-level-token + tok_to_orig_index: inverse map. + + """ + + l_n = [] + l_hs = [] # The length of columns for each batch + + input_ids = [] + order_ids = [] + type_ids = [] + segment_ids = [] + match_ids = [] + input_mask = [] + + i_nlu = [ + ] # index to retreive the position of contextual vector later. + i_hds = [] + tokens = [] + orders = [] + types = [] + matchs = [] + segments = [] + schema_link_matrix_list = [] + schema_link_mask_list = [] + start_index = [] + column_index = [] + col_dict_list = [] + header_list = [] + header_flatten_token_list = [] + header_flatten_tokenid_list = [] + header_flatten_index_list = [] + + header_tok_max_len = 0 + cur_max_length = 0 + + for b, nlu_t1 in enumerate(nlu_t): + hs_t1 = [hs_t[b][-1]] + hs_t[b][:-1] + type_t1 = [col_types[b][-1]] + col_types[b][:-1] + unit_t1 = [units[b][-1]] + units[b][:-1] + l_hs.append(len(hs_t1)) + + # [CLS] nlu [SEP] col1 [SEP] col2 [SEP] ...col-n [SEP] + # 2. 
Generate BERT inputs & indices. + tokens1, orders1, types1, segment1, match1, i_nlu1, i_hds_1, \ + start_idx, column_start, col_dict, schema_tok, \ + header_flatten_tokens, header_flatten_index, schema_link_matrix, schema_link_mask = \ + self.generate_inputs( + nlu_t1, hs_t1, type_t1, unit_t1, his_sql[b], + q_know[b], t_know[b], schema_link[b]) + + l_n.append(i_nlu1[1] - i_nlu1[0]) + start_index.append(start_idx) + column_index.append(column_start) + col_dict_list.append(col_dict) + tokens.append(tokens1) + orders.append(orders1) + types.append(types1) + segments.append(segment1) + matchs.append(match1) + i_nlu.append(i_nlu1) + i_hds.append(i_hds_1) + schema_link_matrix_list.append(schema_link_matrix) + schema_link_mask_list.append(schema_link_mask) + header_flatten_token_list.append(header_flatten_tokens) + header_flatten_index_list.append(header_flatten_index) + header_list.append(schema_tok) + header_max = max([len(schema_tok1) for schema_tok1 in schema_tok]) + if header_max > header_tok_max_len: + header_tok_max_len = header_max + + if len(tokens1) > cur_max_length: + cur_max_length = len(tokens1) + + if len(tokens1) > 512: + print('input too long!!! total_num:%d\t question:%s' % + (len(tokens1), ''.join(nlu_t1))) + + assert cur_max_length <= 512 + + for i, tokens1 in enumerate(tokens): + segment_ids1 = segments[i] + order_ids1 = orders[i] + type_ids1 = types[i] + match_ids1 = matchs[i] + input_ids1 = tokenizer.convert_tokens_to_ids(tokens1) + input_mask1 = [1] * len(input_ids1) + + while len(input_ids1) < cur_max_length: + input_ids1.append(0) + input_mask1.append(0) + segment_ids1.append(0) + order_ids1.append(0) + type_ids1.append(0) + match_ids1.append(0) + + if len(input_ids1) != cur_max_length: + print('Error: ', nlu_t1, tokens1, len(input_ids1), + cur_max_length) + + assert len(input_ids1) == cur_max_length + assert len(input_mask1) == cur_max_length + assert len(order_ids1) == cur_max_length + assert len(segment_ids1) == cur_max_length + assert len(match_ids1) == cur_max_length + assert len(type_ids1) == cur_max_length + + input_ids.append(input_ids1) + order_ids.append(order_ids1) + type_ids.append(type_ids1) + segment_ids.append(segment_ids1) + input_mask.append(input_mask1) + match_ids.append(match_ids1) + + header_len = [] + header_ids = [] + header_max_len = max( + [len(header_list1) for header_list1 in header_list]) + for header1 in header_list: + header_len1 = [] + header_ids1 = [] + for header_tok in header1: + header_len1.append(len(header_tok)) + header_tok_ids1 = tokenizer.convert_tokens_to_ids(header_tok) + while len(header_tok_ids1) < header_tok_max_len: + header_tok_ids1.append(0) + header_ids1.append(header_tok_ids1) + while len(header_ids1) < header_max_len: + header_ids1.append([0] * header_tok_max_len) + header_len.append(header_len1) + header_ids.append(header_ids1) + + for i, header_flatten_token in enumerate(header_flatten_token_list): + header_flatten_tokenid = tokenizer.convert_tokens_to_ids( + header_flatten_token) + header_flatten_tokenid_list.append(header_flatten_tokenid) + + # Convert to tensor + all_input_ids = torch.tensor( + input_ids, dtype=torch.long).to(self._device_name) + all_order_ids = torch.tensor( + order_ids, dtype=torch.long).to(self._device_name) + all_type_ids = torch.tensor( + type_ids, dtype=torch.long).to(self._device_name) + all_input_mask = torch.tensor( + input_mask, dtype=torch.long).to(self._device_name) + all_segment_ids = torch.tensor( + segment_ids, dtype=torch.long).to(self._device_name) + all_match_ids = torch.tensor( + 
match_ids, dtype=torch.long).to(self._device_name) + all_header_ids = torch.tensor( + header_ids, dtype=torch.long).to(self._device_name) + all_ids = torch.arange( + all_input_ids.shape[0], dtype=torch.long).to(self._device_name) + + bS = len(header_flatten_tokenid_list) + max_header_flatten_token_length = max( + [len(x) for x in header_flatten_tokenid_list]) + all_header_flatten_tokens = numpy.zeros( + (bS, max_header_flatten_token_length), dtype='int32') + all_header_flatten_index = numpy.zeros( + (bS, max_header_flatten_token_length), dtype='int32') + for i, header_flatten_tokenid in enumerate( + header_flatten_tokenid_list): + for j, tokenid in enumerate(header_flatten_tokenid): + all_header_flatten_tokens[i, j] = tokenid + for j, hdindex in enumerate(header_flatten_index_list[i]): + all_header_flatten_index[i, j] = hdindex + all_header_flatten_output = numpy.zeros((bS, header_max_len + 1), + dtype='int32') + all_header_flatten_tokens = torch.tensor( + all_header_flatten_tokens, dtype=torch.long).to(self._device_name) + all_header_flatten_index = torch.tensor( + all_header_flatten_index, dtype=torch.long).to(self._device_name) + all_header_flatten_output = torch.tensor( + all_header_flatten_output, + dtype=torch.float32).to(self._device_name) + + all_token_column_id = numpy.zeros((bS, cur_max_length), dtype='int32') + all_token_column_mask = numpy.zeros((bS, cur_max_length), + dtype='float32') + for bi, col_dict in enumerate(col_dict_list): + for ki, vi in col_dict.items(): + all_token_column_id[bi, ki] = vi + 1 + all_token_column_mask[bi, ki] = 1.0 + all_token_column_id = torch.tensor( + all_token_column_id, dtype=torch.long).to(self._device_name) + all_token_column_mask = torch.tensor( + all_token_column_mask, dtype=torch.float32).to(self._device_name) + + all_schema_link_matrix = numpy.zeros( + (bS, cur_max_length, cur_max_length), dtype='int32') + all_schema_link_mask = numpy.zeros( + (bS, cur_max_length, cur_max_length), dtype='float32') + for i, schema_link_matrix in enumerate(schema_link_matrix_list): + temp_len = schema_link_matrix.shape[0] + all_schema_link_matrix[i, 0:temp_len, + 0:temp_len] = schema_link_matrix + all_schema_link_mask[i, 0:temp_len, + 0:temp_len] = schema_link_mask_list[i] + all_schema_link_matrix = torch.tensor( + all_schema_link_matrix, dtype=torch.long).to(self._device_name) + all_schema_link_mask = torch.tensor( + all_schema_link_mask, dtype=torch.long).to(self._device_name) + + # 5. generate l_hpu from i_hds + l_hpu = self.gen_l_hpu(i_hds) + + # 4. Generate BERT output. 
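Before the backbone call that follows, note how each sample's `col_dict` (token position of a column placeholder mapped to its column index, built in `generate_inputs`) is densified into `all_token_column_id` / `all_token_column_mask` above. A one-sample sketch with illustrative shapes and positions:

```python
import numpy

col_dict = {30: 0, 31: 1, 32: 2}                 # hypothetical placeholder positions -> column ids
cur_max_length = 40
token_column_id = numpy.zeros(cur_max_length, dtype='int32')
token_column_mask = numpy.zeros(cur_max_length, dtype='float32')
for pos, col in col_dict.items():
    token_column_id[pos] = col + 1               # stored 1-based, presumably so 0 can mean "not a column slot"
    token_column_mask[pos] = 1.0
```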
+ all_encoder_layer, pooled_output = model_bert( + all_input_ids, + all_header_ids, + token_order_ids=all_order_ids, + token_type_ids=all_segment_ids, + attention_mask=all_input_mask, + match_type_ids=all_match_ids, + l_hs=l_hs, + header_len=header_len, + type_ids=all_type_ids, + col_dict_list=col_dict_list, + ids=all_ids, + header_flatten_tokens=all_header_flatten_tokens, + header_flatten_index=all_header_flatten_index, + header_flatten_output=all_header_flatten_output, + token_column_id=all_token_column_id, + token_column_mask=all_token_column_mask, + column_start_index=column_index, + headers_length=l_hs, + all_schema_link_matrix=all_schema_link_matrix, + all_schema_link_mask=all_schema_link_mask, + output_all_encoded_layers=False) + + return all_encoder_layer, pooled_output, tokens, i_nlu, i_hds, \ + l_n, l_hpu, l_hs, start_index, column_index, all_ids + + def predict(self, querys): + self.head_model.eval() + self.backbone_model.eval() + + nlu, nlu_t, sql_i, q_know, t_know, tb, hs_t, types, units, his_sql, schema_link = \ + self.get_fields_info(querys, None, train=False) + + with torch.no_grad(): + all_encoder_layer, _, tokens, i_nlu, i_hds, l_n, l_hpu, l_hs, start_index, column_index, ids = \ + self.get_bert_output( + self.backbone_model, self.tokenizer, + nlu_t, hs_t, types, units, his_sql, q_know, t_know, schema_link) + + s_action, s_sc, s_sa, s_cco, s_wc, s_wo, s_wvs, s_len = self.head_model( + all_encoder_layer, l_n, l_hs, start_index, column_index, + tokens, ids) + + action_batch = torch.argmax(F.softmax(s_action, -1), -1).cpu().tolist() + scco_batch = torch.argmax(F.softmax(s_cco, -1), -1).cpu().tolist() + sc_batch = torch.argmax(F.softmax(s_sc, -1), -1).cpu().tolist() + sa_batch = torch.argmax(F.softmax(s_sa, -1), -1).cpu().tolist() + wc_batch = torch.argmax(F.softmax(s_wc, -1), -1).cpu().tolist() + wo_batch = torch.argmax(F.softmax(s_wo, -1), -1).cpu().tolist() + s_wvs_s, s_wvs_e = s_wvs + wvss_batch = torch.argmax(F.softmax(s_wvs_s, -1), -1).cpu().tolist() + wvse_batch = torch.argmax(F.softmax(s_wvs_e, -1), -1).cpu().tolist() + s_slen, s_wlen = s_len + slen_batch = torch.argmax(F.softmax(s_slen, -1), -1).cpu().tolist() + wlen_batch = torch.argmax(F.softmax(s_wlen, -1), -1).cpu().tolist() + + pr_wvi = [] + for i in range(len(querys)): + wvi = [] + for j in range(wlen_batch[i]): + wvi.append([ + max(0, wvss_batch[i][j] - 1), + max(0, wvse_batch[i][j] - 1) + ]) + pr_wvi.append(wvi) + pr_wvi_str = self.convert_string(pr_wvi, nlu, nlu_t) + + pre_results = [] + for ib in range(len(querys)): + res_one = {} + sql = {} + sql['cond_conn_op'] = scco_batch[ib] + sl = slen_batch[ib] + sql['sel'] = list( + numpy.array(sc_batch[ib][:sl]).astype(numpy.int32) - 1) + sql['agg'] = list( + numpy.array(sa_batch[ib][:sl]).astype(numpy.int32)) + sels = [] + aggs = [] + for ia, sel in enumerate(sql['sel']): + if sel == -1: + if sql['agg'][ia] > 0: + sels.append(l_hs[ib] - 1) + aggs.append(sql['agg'][ia]) + continue + sels.append(sel) + if sql['agg'][ia] == -1: + aggs.append(0) + else: + aggs.append(sql['agg'][ia]) + if len(sels) == 0: + sels.append(l_hs[ib] - 1) + aggs.append(0) + assert len(sels) == len(aggs) + sql['sel'] = sels + sql['agg'] = aggs + + conds = [] + wl = wlen_batch[ib] + wc_os = list( + numpy.array(wc_batch[ib][:wl]).astype(numpy.int32) - 1) + wo_os = list(numpy.array(wo_batch[ib][:wl]).astype(numpy.int32)) + res_one['question_tok'] = querys[ib]['question_tok'] + for i in range(wl): + if wc_os[i] == -1: + continue + conds.append([wc_os[i], wo_os[i], pr_wvi_str[ib][i]]) + if 
len(conds) == 0: + conds.append([l_hs[ib] - 1, 2, 'Nulll']) + sql['conds'] = conds + res_one['question'] = querys[ib]['question'] + res_one['table_id'] = querys[ib]['table_id'] + res_one['sql'] = sql + res_one['action'] = action_batch[ib] + res_one['model_out'] = [ + sc_batch[ib], sa_batch[ib], wc_batch[ib], wo_batch[ib], + wvss_batch[ib], wvse_batch[ib] + ] + pre_results.append(res_one) + + return pre_results + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Tensor]: results + Example: + """ + result = self.predict(input['datas'])[0] + + return { + 'result': result, + 'history_sql': input['datas'][0]['history_sql'] + } diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 49cf0ee4..7493ba74 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -7,12 +7,14 @@ if TYPE_CHECKING: from .information_extraction import InformationExtractionModel from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase + from .token_classification import TokenClassificationModel else: _import_structure = { 'information_extraction': ['InformationExtractionModel'], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], + 'token_classification': ['TokenClassificationModel'], } import sys diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 988f2917..80bfd476 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -48,7 +48,7 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): self.build_backbone(backbone_cfg) self.build_head(head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(pooled_output) diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py new file mode 100644 index 00000000..29679838 --- /dev/null +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -0,0 +1,83 @@ +from typing import Any, Dict + +import numpy as np +import torch + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) + +__all__ = ['TokenClassificationModel'] + + +@MODELS.register_module( + Tasks.token_classification, module_name=TaskModels.token_classification) +class TokenClassificationModel(SingleBackboneTaskModelBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the token classification model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + backbone_cfg = self.cfg.backbone + head_cfg = self.cfg.head + + # get the num_labels + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + self.id2label = {id: label for label, id in label2id.items()} + head_cfg['num_labels'] = num_labels + + self.build_backbone(backbone_cfg) + self.build_head(head_cfg) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + labels = None + if OutputKeys.LABEL in input: + labels = input.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in input: + labels = input.pop(OutputKeys.LABELS) + + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(sequence_output) + if labels in input: + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + return outputs + + def extract_logits(self, outputs): + return outputs[OutputKeys.LOGITS].cpu().detach() + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + return sequence_output, pooled_output + + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def postprocess(self, input, **kwargs): + logits = self.extract_logits(input) + pred = torch.argmax(logits[0], dim=-1) + pred = torch_nested_numpify(torch_nested_detach(pred)) + logits = torch_nested_numpify(torch_nested_detach(logits)) + res = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} + return res diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index 59d7d0cf..0be921d0 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -91,6 +91,7 @@ class TokenClassification(TorchModel): @MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) @MODELS.register_module( Tasks.token_classification, module_name=Models.structbert) class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): diff --git a/modelscope/msdatasets/cv/easycv_base.py b/modelscope/msdatasets/cv/easycv_base.py new file mode 100644 index 00000000..a45827a3 --- /dev/null +++ b/modelscope/msdatasets/cv/easycv_base.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp + + +class EasyCVBaseDataset(object): + """Adapt to MSDataset. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. 
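The `_update_data_source` helper defined just below is the piece that ties these wrappers to MSDataset: it substitutes the `${data_root}` placeholder in string-valued `data_source` entries with the local cache path taken from `split_config`. A worked example, with all paths hypothetical:

```python
split_config = {'train': '/home/user/.cache/modelscope/my_dataset/'}
data_source = {
    'ann_file': '${data_root}/annotations/train.json',
    'img_prefix': '${data_root}/images',
    'classes': ['cat', 'dog'],   # non-string values are left untouched
}
# data_root becomes '/home/user/.cache/modelscope/my_dataset' (trailing separator stripped), so:
# data_source['ann_file']   -> '/home/user/.cache/modelscope/my_dataset/annotations/train.json'
# data_source['img_prefix'] -> '/home/user/.cache/modelscope/my_dataset/images'
```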
+ """ + DATA_ROOT_PATTERN = '${data_root}' + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + args=(), + kwargs={}) -> None: + self.split_config = split_config + self.preprocessor = preprocessor + self.mode = mode + if self.split_config is not None: + self._update_data_source(kwargs['data_source']) + + def _update_data_source(self, data_source): + data_root = next(iter(self.split_config.values())) + data_root = data_root.rstrip(osp.sep) + + for k, v in data_source.items(): + if isinstance(v, str) and self.DATA_ROOT_PATTERN in v: + data_source.update( + {k: v.replace(self.DATA_ROOT_PATTERN, data_root)}) diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py index a902999d..2f2e03ef 100644 --- a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py +++ b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py @@ -2,6 +2,7 @@ from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS from modelscope.utils.constant import Tasks @@ -9,5 +10,28 @@ from modelscope.utils.constant import Tasks @TASK_DATASETS.register_module( group_key=Tasks.face_2d_keypoints, module_name=Datasets.Face2dKeypointsDataset) -class FaceKeypointDataset(_FaceKeypointDataset): - """EasyCV dataset for face 2d keypoints.""" +class FaceKeypointDataset(EasyCVBaseDataset, _FaceKeypointDataset): + """EasyCV dataset for face 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. + """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _FaceKeypointDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/image_classification/classification_dataset.py b/modelscope/msdatasets/cv/image_classification/classification_dataset.py index c7145f2b..ba73e472 100644 --- a/modelscope/msdatasets/cv/image_classification/classification_dataset.py +++ b/modelscope/msdatasets/cv/image_classification/classification_dataset.py @@ -2,6 +2,7 @@ from easycv.datasets.classification import ClsDataset as _ClsDataset from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS from modelscope.utils.constant import Tasks @@ -10,10 +11,26 @@ from modelscope.utils.constant import Tasks group_key=Tasks.image_classification, module_name=Datasets.ClsDataset) class ClsDataset(_ClsDataset): """EasyCV dataset for classification. - For more details, please refer to : - https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/classification/raw.py . Args: - data_source: Data source config to parse input data. - pipeline: Sequence of transform object or config dict to be composed. + split_config (dict): Dataset root path from MSDataset, e.g. 
+ {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _ClsDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py index 21114c11..b1316e2e 100644 --- a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py +++ b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py @@ -2,20 +2,41 @@ from easycv.datasets.segmentation import SegDataset as _SegDataset from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS from modelscope.utils.constant import Tasks @TASK_DATASETS.register_module( group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset) -class SegDataset(_SegDataset): +class SegDataset(EasyCVBaseDataset, _SegDataset): """EasyCV dataset for Sementic segmentation. For more details, please refer to : https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/segmentation/raw.py . Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. data_source: Data source config to parse input data. pipeline: Sequence of transform object or config dict to be composed. ignore_index (int): Label index to be ignored. profiling: If set True, will print transform time. """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _SegDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py index 5b130a3e..2f6ad7d3 100644 --- a/modelscope/msdatasets/cv/object_detection/detection_dataset.py +++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py @@ -1,31 +1,54 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp + from easycv.datasets.detection import DetDataset as _DetDataset from easycv.datasets.detection import \ DetImagesMixDataset as _DetImagesMixDataset from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset from modelscope.msdatasets.task_datasets import TASK_DATASETS from modelscope.utils.constant import Tasks @TASK_DATASETS.register_module( group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset) -class DetDataset(_DetDataset): +class DetDataset(EasyCVBaseDataset, _DetDataset): """EasyCV dataset for object detection. For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py . 
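The same two-step construction is used by every wrapper in this change: `EasyCVBaseDataset.__init__` first rewrites `${data_root}` inside `data_source`, then the original EasyCV dataset class is initialized with the same kwargs. A hedged instantiation sketch follows; the `data_source` and `pipeline` contents are placeholders standing in for real EasyCV configs, not verified type names:

```python
from modelscope.msdatasets.cv.image_semantic_segmentation.segmentation_dataset import SegDataset

ds = SegDataset(
    split_config={'train': '/local/msdataset/cache'},        # injected by MSDataset
    data_source={'type': '<EasyCV segmentation source>',     # placeholder, must be a real EasyCV registry name
                 'img_root': '${data_root}/images',
                 'label_root': '${data_root}/labels'},
    pipeline=[{'type': '<EasyCV transform>'}],                # placeholder transform configs
)
# EasyCVBaseDataset resolves '${data_root}' in place, so _SegDataset sees absolute paths.
```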
Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. data_source: Data source config to parse input data. pipeline: Transform config list profiling: If set True, will print pipeline time classes: A list of class names, used in evaluation for result and groundtruth visualization """ + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _DetDataset.__init__(self, *args, **kwargs) + @TASK_DATASETS.register_module( group_key=Tasks.image_object_detection, module_name=Datasets.DetImagesMixDataset) -class DetImagesMixDataset(_DetImagesMixDataset): +class DetImagesMixDataset(EasyCVBaseDataset, _DetImagesMixDataset): """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset. Suitable for training on multiple images mixed data augmentation like mosaic and mixup. For the augmentation pipeline of mixed image data, @@ -38,6 +61,11 @@ class DetImagesMixDataset(_DetImagesMixDataset): For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/mix.py . Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. data_source (:obj:`DetSourceCoco`): Data source config to parse input data. pipeline (Sequence[dict]): Sequence of transform object or config dict to be composed. @@ -47,3 +75,18 @@ class DetImagesMixDataset(_DetImagesMixDataset): be skip pipeline. Default to None. label_padding: out labeling padding [N, 120, 5] """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _DetImagesMixDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index d0d0ab92..ca84db4f 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
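In the `MsDataset.load` hunk that follows, the dataset-name handling is split by input type; for a local loading script, the name reported to the hub is now derived from the file name rather than the full path. A small worked example (the path is hypothetical):

```python
import os

dataset_name = '/home/user/datasets/my_dataset.py'   # a local hf-compatible loading script
file_name = os.path.basename(dataset_name)           # 'my_dataset.py'
download_dataset = os.path.splitext(file_name)[0]    # 'my_dataset', passed to api.on_dataset_download
```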
+ import math import os from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, @@ -220,18 +222,23 @@ class MsDataset: api = HubApi() download_dataset = '' if isinstance(dataset_name, str): - download_dataset = dataset_name dataset_formation = DatasetFormations.native - if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ - (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): + if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir( + dataset_name): + dataset_formation = DatasetFormations.hf_compatible + elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'): dataset_formation = DatasetFormations.hf_compatible + file_name = os.path.basename(dataset_name) + download_dataset = os.path.splitext(file_name)[0] elif is_relative_path(dataset_name) and dataset_name.count( '/') == 0: + download_dataset = dataset_name dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts( dataset_name, namespace, download_mode, version) # dataset organized to be compatible with hf format if dataset_formation == DatasetFormations.hf_compatible: dataset_name = dataset_scripts['.py'][0] + download_dataset = dataset_name else: raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " @@ -268,8 +275,11 @@ class MsDataset: f' {type(dataset_name)}') if download_dataset: - api.on_dataset_download( - dataset_name=download_dataset, namespace=namespace) + try: + api.on_dataset_download( + dataset_name=download_dataset, namespace=namespace) + except Exception as e: + logger.error(e) return MsDataset.from_hf_dataset(dataset, target=target) @@ -396,8 +406,8 @@ class MsDataset: ) if isinstance(self._hf_ds, ExternalDataset): task_data_config.update({'preprocessor': preprocessors}) - return build_task_dataset(task_data_config, task_name, - self._hf_ds.config_kwargs) + task_data_config.update(self._hf_ds.config_kwargs) + return build_task_dataset(task_data_config, task_name) if preprocessors is not None: return self.to_torch_dataset_with_processors( preprocessors, columns=columns) @@ -588,7 +598,7 @@ class MsDataset: """Clone meta-file of dataset from the ModelScope Hub. Args: dataset_work_dir (str): Current git working directory. - dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name . + dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name . revision(`Optional[str]`): revision of the model you want to clone from. Can be any of a branch, tag or commit hash auth_token(`Optional[str]`): @@ -610,11 +620,11 @@ class MsDataset: if clone_work_dir: logger.info('Already cloned repo to: {}'.format(clone_work_dir)) else: - logger.warning('The repo working dir is already ex.') + logger.warning( + 'Repo dir already exists: {}'.format(clone_work_dir)) @staticmethod def upload_meta(dataset_work_dir: str, - dataset_id: str, commit_message: str, revision: Optional[str] = DEFAULT_DATASET_REVISION, auth_token: Optional[str] = None, @@ -624,7 +634,6 @@ class MsDataset: Args: dataset_work_dir (str): Current working directory. - dataset_id (str): Dataset id, It should be like your-namespace/your-dataset-name . commit_message (str): Commit message. revision(`Optional[str]`): revision of the model you want to clone from. 
Can be any of a branch, tag or commit hash @@ -641,7 +650,7 @@ class MsDataset: """ _repo = DatasetRepository( repo_work_dir=dataset_work_dir, - dataset_id=dataset_id, + dataset_id='', revision=revision, auth_token=auth_token, git_path=git_path) diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index f97ff8b2..e2bf5bc1 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -11,12 +11,14 @@ if TYPE_CHECKING: from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset from .movie_scene_segmentation import MovieSceneSegmentationDataset from .video_summarization_dataset import VideoSummarizationDataset + from .passage_ranking_dataset import PassageRankingDataset else: _import_structure = { 'base': ['TaskDataset'], 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], + 'passage_ranking_dataset': ['PassageRankingDataset'], 'veco_dataset': ['VecoDataset'], 'image_instance_segmentation_coco_dataset': ['ImageInstanceSegmentationCocoDataset'], diff --git a/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py b/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py new file mode 100644 index 00000000..517e0d36 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py @@ -0,0 +1,151 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import random +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple, Union + +import torch +from datasets import Dataset, IterableDataset, concatenate_datasets +from torch.utils.data import ConcatDataset +from transformers import DataCollatorWithPadding + +from modelscope.metainfo import Models +from modelscope.utils.constant import ModeKeys, Tasks +from .base import TaskDataset +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + + +@TASK_DATASETS.register_module( + group_key=Tasks.passage_ranking, module_name=Models.bert) +class PassageRankingDataset(TorchTaskDataset): + + def __init__(self, + datasets: Union[Any, List[Any]], + mode, + preprocessor=None, + *args, + **kwargs): + self.seed = kwargs.get('seed', 42) + self.permutation = None + self.datasets = None + self.dataset_config = kwargs + self.query_sequence = self.dataset_config.get('query_sequence', + 'query') + self.pos_sequence = self.dataset_config.get('pos_sequence', + 'positive_passages') + self.neg_sequence = self.dataset_config.get('neg_sequence', + 'negative_passages') + self.passage_text_fileds = self.dataset_config.get( + 'passage_text_fileds', ['title', 'text']) + self.qid_field = self.dataset_config.get('qid_field', 'query_id') + if mode == ModeKeys.TRAIN: + train_config = kwargs.get('train', {}) + self.neg_samples = train_config.get('neg_samples', 4) + + super().__init__(datasets, mode, preprocessor, **kwargs) + + def __getitem__(self, index) -> Any: + if self.mode == ModeKeys.TRAIN: + return self.__get_train_item__(index) + else: + return self.__get_test_item__(index) + + def __get_test_item__(self, index): + group = self._inner_dataset[index] + labels = [] + + qry = group[self.query_sequence] + + pos_sequences = group[self.pos_sequence] + pos_sequences = [ + ' '.join([ele[key] for key in self.passage_text_fileds]) + for ele in pos_sequences + ] + labels.extend([1] * len(pos_sequences)) + + neg_sequences = group[self.neg_sequence] + neg_sequences = [ + ' '.join([ele[key] for key in 
self.passage_text_fileds]) + for ele in neg_sequences + ] + + labels.extend([0] * len(neg_sequences)) + qid = group[self.qid_field] + + examples = pos_sequences + neg_sequences + sample = { + 'qid': torch.LongTensor([int(qid)] * len(labels)), + self.preprocessor.first_sequence: qry, + self.preprocessor.second_sequence: examples, + 'labels': torch.LongTensor(labels) + } + return self.prepare_sample(sample) + + def __get_train_item__(self, index): + group = self._inner_dataset[index] + + qry = group[self.query_sequence] + + pos_sequences = group[self.pos_sequence] + pos_sequences = [ + ' '.join([ele[key] for key in self.passage_text_fileds]) + for ele in pos_sequences + ] + + neg_sequences = group[self.neg_sequence] + neg_sequences = [ + ' '.join([ele[key] for key in self.passage_text_fileds]) + for ele in neg_sequences + ] + + pos_psg = random.choice(pos_sequences) + + if len(neg_sequences) < self.neg_samples: + negs = random.choices(neg_sequences, k=self.neg_samples) + else: + negs = random.sample(neg_sequences, k=self.neg_samples) + examples = [pos_psg] + negs + sample = { + self.preprocessor.first_sequence: qry, + self.preprocessor.second_sequence: examples, + } + return self.prepare_sample(sample) + + def __len__(self): + return len(self._inner_dataset) + + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: + """Prepare a dataset. + + User can process the input datasets in a whole dataset perspective. + This method gives a default implementation of datasets merging, user can override this + method to write custom logics. + + Args: + datasets: The original dataset(s) + + Returns: A single dataset, which may be created after merging. + + """ + if isinstance(datasets, List): + if len(datasets) == 1: + return datasets[0] + elif len(datasets) > 1: + return ConcatDataset(datasets) + else: + return datasets + + def prepare_sample(self, data): + """Preprocess the data fetched from the inner_dataset. + + If the preprocessor is None, the original data will be returned, else the preprocessor will be called. + User can override this method to implement custom logics. + + Args: + data: The data fetched from the dataset. + + Returns: The processed data. + + """ + return self.preprocessor( + data) if self.preprocessor is not None else data diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py index 825400c4..0548f7b9 100644 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ b/modelscope/msdatasets/utils/dataset_builder.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Mapping, Sequence, Union diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index 769bed93..ef42f75f 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from collections import defaultdict from typing import Any, Mapping, Optional, Sequence, Union diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py index eb1c99ef..2e21bf50 100644 --- a/modelscope/msdatasets/utils/download_utils.py +++ b/modelscope/msdatasets/utils/download_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
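For reference, one raw record consumed by the `PassageRankingDataset` above, using the default field names (`query_id`, `query`, `positive_passages`, `negative_passages`, with each passage text built by joining `title` and `text`); the values are made up:

```python
group = {
    'query_id': 7,
    'query': 'what is modelscope',
    'positive_passages': [
        {'title': 'ModelScope', 'text': 'ModelScope is an open model hub ...'},
    ],
    'negative_passages': [
        {'title': 'Other', 'text': 'an unrelated passage'},
        {'title': 'Other2', 'text': 'another unrelated passage'},
    ],
}
# Training item: one random positive plus neg_samples negatives; when fewer negatives than
# neg_samples are available, random.choices (sampling with replacement) is used instead of random.sample.
# Evaluation item: all positives (label 1) followed by all negatives (label 0), plus the qid tensor.
```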
+ from typing import Optional from datasets.utils.download_manager import DownloadConfig, DownloadManager diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py index 9a7040a1..4a403876 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from __future__ import print_function import os diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index fbe5c531..4813b89f 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .oss_utils import OssUtilities diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c6a7a619..b3eb9ad8 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -35,6 +35,7 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + HISTORY = 'history' TIMESTAMPS = 'timestamps' SPLIT_VIDEO_NUM = 'split_video_num' SPLIT_META_DICT = 'split_meta_dict' @@ -201,9 +202,9 @@ TASK_OUTPUTS = { # [[score]*15] # ] # "boxes": [ - # [[x1, y1], [x2, y2]], - # [[x1, y1], [x2, y2]], - # [[x1, y1], [x2, y2]], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], # ] # } Tasks.body_2d_keypoints: @@ -359,26 +360,20 @@ TASK_OUTPUTS = { # word segmentation result for single sample # { # "output": "今天 天气 不错 , 适合 出去 游玩" - # } - Tasks.word_segmentation: [OutputKeys.OUTPUT], - - # part-of-speech result for single sample - # [ - # {'word': '诸葛', 'label': 'PROPN'}, - # {'word': '亮', 'label': 'PROPN'}, - # {'word': '发明', 'label': 'VERB'}, - # {'word': '八', 'label': 'NUM'}, - # {'word': '阵', 'label': 'NOUN'}, - # {'word': '图', 'label': 'PART'}, - # {'word': '以', 'label': 'ADV'}, - # {'word': '利', 'label': 'VERB'}, - # {'word': '立营', 'label': 'VERB'}, - # {'word': '练兵', 'label': 'VERB'}, - # {'word': '.', 'label': 'PUNCT'} + # "labels": [ + # {'word': '今天', 'label': 'PROPN'}, + # {'word': '天气', 'label': 'PROPN'}, + # {'word': '不错', 'label': 'VERB'}, + # {'word': ',', 'label': 'NUM'}, + # {'word': '适合', 'label': 'NOUN'}, + # {'word': '出去', 'label': 'PART'}, + # {'word': '游玩', 'label': 'ADV'}, # ] - # TODO @wenmeng.zwm support list of result check - Tasks.part_of_speech: [OutputKeys.WORD, OutputKeys.LABEL], + # } + Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], + Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS], + # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample # { # "output": [ @@ -393,19 +388,14 @@ TASK_OUTPUTS = { # "output": "我想吃苹果" # } Tasks.text_error_correction: [OutputKeys.OUTPUT], - + Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], + Tasks.passage_ranking: [OutputKeys.SCORES], # text generation result for single sample # { # "text": "this is the text generated by a model." # } Tasks.text_generation: [OutputKeys.TEXT], - # text feature extraction for single sample - # { - # "text_embedding": np.array with shape [1, D] - # } - Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING], - # fill mask result for single sample # { # "text": "this is the text which masks filled by model." @@ -482,6 +472,13 @@ TASK_OUTPUTS = { # } Tasks.conversational_text_to_sql: [OutputKeys.TEXT], + # table-question-answering result for single sample + # { + # "sql": "SELECT shop.Name FROM shop." 
+ # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]} + # } + Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY], + # ============ audio tasks =================== # asr result for single sample # { "text": "每一天都要快乐喔"} @@ -616,4 +613,9 @@ TASK_OUTPUTS = { # "img_embedding": np.array with shape [1, D], # } Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING], + + # { + # 'output': ['Done' / 'Decode_Error'] + # } + Tasks.video_inpainting: [OutputKeys.OUTPUT] } diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py index 5ed4d769..e55f613e 100644 --- a/modelscope/pipelines/audio/ans_pipeline.py +++ b/modelscope/pipelines/audio/ans_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import io from typing import Any, Dict @@ -6,6 +8,7 @@ import numpy as np import soundfile as sf import torch +from modelscope.fileio import File from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline @@ -34,11 +37,12 @@ class ANSPipeline(Pipeline): super().__init__(model=model, **kwargs) self.model.eval() - def preprocess(self, inputs: Input) -> Dict[str, Any]: + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: if isinstance(inputs, bytes): data1, fs = sf.read(io.BytesIO(inputs)) elif isinstance(inputs, str): - data1, fs = sf.read(inputs) + file_bytes = File.read(inputs) + data1, fs = sf.read(io.BytesIO(file_bytes)) else: raise TypeError(f'Unsupported type {type(inputs)}.') if len(data1.shape) > 1: @@ -50,7 +54,8 @@ class ANSPipeline(Pipeline): inputs = np.reshape(data, [1, data.shape[0]]) return {'ndarray': inputs, 'nsamples': data.shape[0]} - def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: ndarray = inputs['ndarray'] if isinstance(ndarray, torch.Tensor): ndarray = ndarray.cpu().numpy() diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py index b321b770..282d1184 100644 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict, List, Sequence, Tuple, Union import yaml @@ -9,6 +8,8 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import WavToScp +from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav, + load_bytes_from_url) from modelscope.utils.constant import Frameworks, Tasks from modelscope.utils.logger import get_logger @@ -41,12 +42,20 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): self.recog_type = recog_type self.audio_format = audio_format - self.audio_in = audio_in self.audio_fs = audio_fs + if isinstance(audio_in, str): + # load pcm data from url if audio_in is url str + self.audio_in = load_bytes_from_url(audio_in) + elif isinstance(audio_in, bytes): + # load pcm data from wav data if audio_in is wave format + self.audio_in = extract_pcm_from_wav(audio_in) + else: + self.audio_in = audio_in + if recog_type is None or audio_format is None: self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=audio_in, + audio_in=self.audio_in, recog_type=recog_type, audio_format=audio_format) diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py 
b/modelscope/pipelines/audio/kws_farfield_pipeline.py index a114e7fb..62f58fee 100644 --- a/modelscope/pipelines/audio/kws_farfield_pipeline.py +++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py @@ -1,7 +1,10 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import io import wave from typing import Any, Dict +from modelscope.fileio import File from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline @@ -39,6 +42,8 @@ class KWSFarfieldPipeline(Pipeline): def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: if isinstance(inputs, bytes): return dict(input_file=inputs) + elif isinstance(inputs, str): + return dict(input_file=inputs) elif isinstance(inputs, Dict): return inputs else: @@ -47,6 +52,8 @@ class KWSFarfieldPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: input_file = inputs['input_file'] + if isinstance(input_file, str): + input_file = File.read(input_file) if isinstance(input_file, bytes): input_file = io.BytesIO(input_file) self.frame_count = 0 diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index 1f31766a..866b8d0b 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -8,6 +8,8 @@ from modelscope.models import Model from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import WavToLists +from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav, + load_bytes_from_url) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -40,6 +42,13 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): if self.preprocessor is None: self.preprocessor = WavToLists() + if isinstance(audio_in, str): + # load pcm data from url if audio_in is url str + audio_in = load_bytes_from_url(audio_in) + elif isinstance(audio_in, bytes): + # load pcm data from wav data if audio_in is wave format + audio_in = extract_pcm_from_wav(audio_in) + output = self.preprocessor.forward(self.model.forward(), audio_in) output = self.forward(output) rst = self.postprocess(output) diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py index b59bc475..e1e75ddb 100644 --- a/modelscope/pipelines/audio/linear_aec_pipeline.py +++ b/modelscope/pipelines/audio/linear_aec_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import importlib import os from typing import Any, Dict @@ -51,7 +53,7 @@ class LinearAECPipeline(Pipeline): When invoke the class with pipeline.__call__(), you should provide two params: Dict[str, Any] - the path of wav files,eg:{ + the path of wav files, eg:{ "nearend_mic": "/your/data/near_end_mic_audio.wav", "farend_speech": "/your/data/far_end_speech_audio.wav"} output_path (str, optional): "/your/output/audio_after_aec.wav" diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index d4f9c6bf..c5db2b57 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -1,7 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
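The audio hunks above normalize their inputs up front: a URL string is now fetched with load_bytes_from_url, raw wav bytes are converted with extract_pcm_from_wav, and a plain path string is read through modelscope.fileio.File. A minimal usage sketch follows; the model id and audio URL are placeholder assumptions, not values from this diff.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# An http(s) URL string, raw wav bytes, or pre-extracted PCM bytes should all
# be accepted; the string case is resolved via load_bytes_from_url internally.
asr = pipeline(Tasks.auto_speech_recognition, model='<your-asr-model-id>')
print(asr('https://example.com/sample.wav'))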
+import os import os.path as osp from abc import ABC, abstractmethod +from functools import partial +from multiprocessing import Pool from threading import Lock from typing import Any, Dict, Generator, List, Mapping, Union @@ -15,15 +18,17 @@ from modelscope.utils.config import Config from modelscope.utils.constant import Frameworks, ModelFile from modelscope.utils.device import (create_device, device_placement, verify_device) +from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import _find_free_port, _is_free_port from .util import is_model, is_official_hub_path if is_torch_available(): import torch if is_tf_available(): - import tensorflow as tf + pass Tensor = Union['torch.Tensor', 'tf.Tensor'] Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray'] @@ -199,44 +204,7 @@ class Pipeline(ABC): yield self._process_single(ele, *args, **kwargs) def _collate_fn(self, data): - """Prepare the input just before the forward function. - This method will move the tensors to the right device. - Usually this method does not need to be overridden. - - Args: - data: The data out of the dataloader. - - Returns: The processed data. - - """ - from torch.utils.data.dataloader import default_collate - from modelscope.preprocessors import InputFeatures - if isinstance(data, dict) or isinstance(data, Mapping): - return type(data)( - {k: self._collate_fn(v) - for k, v in data.items()}) - elif isinstance(data, (tuple, list)): - if isinstance(data[0], (int, float)): - return default_collate(data).to(self.device) - else: - return type(data)(self._collate_fn(v) for v in data) - elif isinstance(data, np.ndarray): - if data.dtype.type is np.str_: - return data - else: - return self._collate_fn(torch.from_numpy(data)) - elif isinstance(data, torch.Tensor): - return data.to(self.device) - elif isinstance(data, (bytes, str, int, float, bool, type(None))): - return data - elif isinstance(data, InputFeatures): - return data - else: - import mmcv - if isinstance(data, mmcv.parallel.data_container.DataContainer): - return data - else: - raise ValueError(f'Unsupported data type {type(data)}') + return collate_fn(data, self.device) def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]: preprocess_params = kwargs.get('preprocess_params', {}) @@ -302,3 +270,146 @@ class Pipeline(ABC): output should have the standard output name. """ raise NotImplementedError('postprocess') + + +class DistributedPipeline(Pipeline): + """This pipeline is used to load multi gpu models. + + What will this class do: + 1. Read the global config from the configuration.json + 2. Set the multiprocessing method to spawn + 3. Open a multiprocessing pool of the world_size to instantiate model pieces. + 4. Set the master port and ip + 5. Call _instantiate_one to instantiate one model piece + This method should be implemented by the derived class. + 6. After the forward method is called, do preprocess in main process + and call _forward_one to collect results, and do + post process in main process. + + NOTE: _instantiate_one and _forward_one are class methods, any derived class should implement them and + store the model handler in the class field. 
+ """ + + def __init__(self, + model: str = None, + preprocessor: Union[Preprocessor, List[Preprocessor]] = None, + auto_collate=True, + **kwargs): + self.preprocessor = preprocessor + self._model_prepare = False + self._model_prepare_lock = Lock() + self._auto_collate = auto_collate + + if os.path.exists(model): + self.model_dir = model + else: + self.model_dir = snapshot_download(model) + self.cfg = read_config(self.model_dir) + self.world_size = self.cfg.model.world_size + self.model_pool = None + self.device_name = 'cpu' + self.device = create_device(self.device_name) + self.has_multiple_models = False + self.framework = self.cfg.framework + if torch.multiprocessing.get_start_method(allow_none=True) is None: + torch.multiprocessing.set_start_method('spawn') + + ranks = list(range(self.world_size)) + self.model_pool = Pool(self.world_size) + master_ip = '127.0.0.1' if 'master_ip' not in kwargs else kwargs[ + 'master_ip'] + master_port = '29500' if 'master_port' not in kwargs else kwargs[ + 'master_port'] + if not _is_free_port(int(master_port)): + master_port = str(_find_free_port()) + self.model_pool.map( + partial( + self.__class__._instantiate_one, + model_dir=self.model_dir, + master_ip=master_ip, + master_port=master_port, + **self.cfg.model, + **kwargs), ranks) + + def __del__(self): + if hasattr(self, 'model_pool') and self.model_pool is not None: + self.model_pool.terminate() + + def __getstate__(self): + self_dict = self.__dict__.copy() + del self_dict['model_pool'] + del self_dict['preprocessor'] + del self_dict['_model_prepare_lock'] + return self_dict + + @classmethod + def _instantiate_one(cls, rank, model_dir, **kwargs): + """Instantiate one model piece. + + @param rank: The model rank. + @param model_dir: The model_dir in the node. + @param kwargs: Any extra args. + @return: None. The model handler should be kept in the class field. + """ + pass + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + inputs = { + 'inputs': inputs, + 'forward_params': forward_params, + } + res = self.model_pool.map(self.__class__._forward_one, + [inputs] * self.world_size) + return res[0] + + @classmethod + def _forward_one(cls, inputs): + """Forward the inputs to one model piece. + + Use the model handler kept in the class field to forward. + + @param inputs: The inputs after the preprocessing. + @return: The forward results. + """ + pass + + +def collate_fn(data, device): + """Prepare the input just before the forward function. + This method will move the tensors to the right device. + Usually this method does not need to be overridden. + + Args: + data: The data out of the dataloader. + device: The device to move data to. + + Returns: The processed data. 
+ + """ + from torch.utils.data.dataloader import default_collate + from modelscope.preprocessors import InputFeatures + if isinstance(data, dict) or isinstance(data, Mapping): + return type(data)({k: collate_fn(v, device) for k, v in data.items()}) + elif isinstance(data, (tuple, list)): + if isinstance(data[0], (int, float)): + return default_collate(data).to(device) + else: + return type(data)(collate_fn(v, device) for v in data) + elif isinstance(data, np.ndarray): + if data.dtype.type is np.str_: + return data + else: + return collate_fn(torch.from_numpy(data), device) + elif isinstance(data, torch.Tensor): + return data.to(device) + elif isinstance(data, (bytes, str, int, float, bool, type(None))): + return data + elif isinstance(data, InputFeatures): + return data + else: + import mmcv + if isinstance(data, mmcv.parallel.data_container.DataContainer): + return data + else: + raise ValueError(f'Unsupported data type {type(data)}') diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 9f265fb8..5e244b27 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -17,9 +17,17 @@ PIPELINES = Registry('pipelines') DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) + Tasks.sentence_embedding: + (Pipelines.sentence_embedding, + 'damo/nlp_corom_sentence-embedding_english-base'), + Tasks.passage_ranking: (Pipelines.passage_ranking, + 'damo/nlp_corom_passage-ranking_english-base'), Tasks.word_segmentation: (Pipelines.word_segmentation, 'damo/nlp_structbert_word-segmentation_chinese-base'), + Tasks.token_classification: + (Pipelines.part_of_speech, + 'damo/nlp_structbert_part-of-speech_chinese-base'), Tasks.named_entity_recognition: (Pipelines.named_entity_recognition, 'damo/nlp_raner_named-entity-recognition_chinese-base-news'), @@ -58,6 +66,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.conversational_text_to_sql: (Pipelines.conversational_text_to_sql, 'damo/nlp_star_conversational-text-to-sql'), + Tasks.table_question_answering: + (Pipelines.table_question_answering_pipeline, + 'damo/nlp-convai-text2sql-pretrain-cn'), Tasks.text_error_correction: (Pipelines.text_error_correction, 'damo/nlp_bart_text-error-correction_chinese'), @@ -165,6 +176,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), Tasks.shop_segmentation: (Pipelines.shop_segmentation, 'damo/cv_vitb16_segmentation_shop-seg'), + Tasks.video_inpainting: (Pipelines.video_inpainting, + 'damo/cv_video-inpainting'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 72a225ff..a9dc05f2 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -46,10 +46,13 @@ if TYPE_CHECKING: from .virtual_try_on_pipeline import VirtualTryonPipeline from .shop_segmentation_pipleline import ShopSegmentationPipeline from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline - from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline + from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline + from .mog_face_detection_pipeline import MogFaceDetectionPipeline + from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline + 
from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipeline else: _import_structure = { @@ -110,9 +113,12 @@ else: ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], + 'mog_face_detection_pipeline': ['MogFaceDetectionPipeline'], + 'ulfd_face_detection_pipeline': ['UlfdFaceDetectionPipeline'], 'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'], 'facial_expression_recognition_pipelin': - ['FacialExpressionRecognitionPipeline'] + ['FacialExpressionRecognitionPipeline'], + 'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'], } import sys diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py index e3400ea7..7f1a46b2 100644 --- a/modelscope/pipelines/cv/action_recognition_pipeline.py +++ b/modelscope/pipelines/cv/action_recognition_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import math import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py index f9ae4b2c..c6a05195 100644 --- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py @@ -76,8 +76,11 @@ class Body2DKeypointsPipeline(Pipeline): } poses, scores, boxes = self.keypoint_model.postprocess(input) + result_boxes = [] + for box in boxes: + result_boxes.append([box[0][0], box[0][1], box[1][0], box[1][1]]) return { - OutputKeys.BOXES: boxes, + OutputKeys.BOXES: result_boxes, OutputKeys.POSES: poses, OutputKeys.SCORES: scores } diff --git a/modelscope/pipelines/cv/face_detection_pipeline.py b/modelscope/pipelines/cv/face_detection_pipeline.py index 8fda5b46..eff5b70f 100644 --- a/modelscope/pipelines/cv/face_detection_pipeline.py +++ b/modelscope/pipelines/cv/face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py index 506346df..873e4a1f 100644 --- a/modelscope/pipelines/cv/face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_recognition_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
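With the Body2DKeypointsPipeline change above, OutputKeys.BOXES now carries flat [x1, y1, x2, y2] lists instead of nested corner pairs. A small consumption sketch, assuming a placeholder model id and image path:

import cv2
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

body_2d = pipeline(Tasks.body_2d_keypoints, model='<a-2d-keypoints-model-id>')
result = body_2d('path/to/person.jpg')
img = cv2.imread('path/to/person.jpg')
for x1, y1, x2, y2 in result[OutputKeys.BOXES]:
    # each box is now a flat [x1, y1, x2, y2] list
    cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
cv2.imwrite('boxes.jpg', img)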
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index eb669354..f34be618 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -40,11 +40,9 @@ class ImageCartoonPipeline(Pipeline): with device_placement(self.framework, self.device_name): self.facer = FaceAna(self.model) self.sess_anime_head = self.load_sess( - os.path.join(self.model, 'cartoon_anime_h.pb'), - 'model_anime_head') + os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head') self.sess_anime_bg = self.load_sess( - os.path.join(self.model, 'cartoon_anime_bg.pb'), - 'model_anime_bg') + os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg') self.box_width = 288 global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) diff --git a/modelscope/pipelines/cv/image_reid_person_pipeline.py b/modelscope/pipelines/cv/image_reid_person_pipeline.py index a14666a1..64674a65 100644 --- a/modelscope/pipelines/cv/image_reid_person_pipeline.py +++ b/modelscope/pipelines/cv/image_reid_person_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import math import os from typing import Any, Dict diff --git a/modelscope/pipelines/cv/mog_face_detection_pipeline.py b/modelscope/pipelines/cv/mog_face_detection_pipeline.py new file mode 100644 index 00000000..8797ad12 --- /dev/null +++ b/modelscope/pipelines/cv/mog_face_detection_pipeline.py @@ -0,0 +1,54 @@ +import os.path as osp +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection import MogFaceDetector +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_detection, module_name=Pipelines.mog_face_detection) +class MogFaceDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {ckpt_path}') + detector = MogFaceDetector(model_path=ckpt_path, device=self.device) + self.detector = detector + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img = img.astype(np.float32) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + result = self.detector(input) + assert result is not None + bboxes = result[:, :4].tolist() + scores = result[:, 4].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: None, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py new file mode 100644 index 00000000..57bf9920 --- /dev/null +++ b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py @@ -0,0 +1,56 @@ +import os.path as osp +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection import MtcnnFaceDetector +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_detection, module_name=Pipelines.mtcnn_face_detection) +class MtcnnFaceDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, './weights') + logger.info(f'loading model from {ckpt_path}') + device = torch.device( + f'cuda:{0}' if torch.cuda.is_available() else 'cpu') + detector = MtcnnFaceDetector(model_path=ckpt_path, device=device) + self.detector = detector + self.device = device + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector(input) + assert result is not None + bboxes = result[0][:, :4].tolist() + scores = result[0][:, 4].tolist() + lms = result[1].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: lms, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py index 4b095042..c20d020c 100644 --- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py @@ -91,7 +91,8 @@ class OCRRecognitionPipeline(Pipeline): data.append(mask) data = torch.FloatTensor(data).view( - len(data), 1, IMG_HEIGHT, IMG_WIDTH).cuda() / 255. + len(data), 1, IMG_HEIGHT, IMG_WIDTH) / 255. 
+ data = data.to(self.device) result = {'img': data} diff --git a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py new file mode 100644 index 00000000..1263082b --- /dev/null +++ b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py @@ -0,0 +1,56 @@ +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection import UlfdFaceDetector +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_detection, module_name=Pipelines.ulfd_face_detection) +class UlfdFaceDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {ckpt_path}') + detector = UlfdFaceDetector(model_path=ckpt_path, device=self.device) + self.detector = detector + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img = img.astype(np.float32) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector(input) + assert result is not None + bboxes = result[0].tolist() + scores = result[1].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: None, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/video_inpainting_pipeline.py b/modelscope/pipelines/cv/video_inpainting_pipeline.py new file mode 100644 index 00000000..15444e05 --- /dev/null +++ b/modelscope/pipelines/cv/video_inpainting_pipeline.py @@ -0,0 +1,47 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_inpainting import inpainting +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_inpainting, module_name=Pipelines.video_inpainting) +class VideoInpaintingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create video inpainting pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + + super().__init__(model=model, **kwargs) + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + decode_error, fps, w, h = inpainting.video_process( + input['video_input_path']) + + if decode_error is not None: + return {OutputKeys.OUTPUT: 'decode_error'} + + inpainting.inpainting_by_model_balance(self.model, + input['video_input_path'], + input['mask_path'], + input['video_output_path'], fps, + w, h) + + return {OutputKeys.OUTPUT: 'Done'} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py index f4ba4d0b..c47fc15f 100644 --- a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py +++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py index f5a180b6..d3b9fef3 100644 --- a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py index 99cccee1..81a5f8cd 100644 --- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -52,6 +52,4 @@ class ImageCaptioningPipeline(Pipeline): return super().forward(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - if isinstance(self.model, OfaForAllTasks): - return inputs - return {OutputKeys.CAPTION: inputs} + return inputs diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py index 1ebcf526..329d79bf 100644 --- a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py @@ -48,4 +48,4 @@ class ImageTextRetrievalPipeline(Pipeline): return super().forward(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return {OutputKeys.SCORES: inputs[0].tolist()} + return inputs diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py index d15970d2..76011be0 100644 --- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py index 7d3ffed3..fc123e2f 100644 --- a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py +++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py index 406538cf..7516c5be 100644 --- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py +++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py @@ -1,9 +1,12 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional import torch from modelscope.metainfo import Pipelines -from modelscope.models.multi_modal import OfaForTextToImageSynthesis +from modelscope.models.multi_modal import ( + MultiStageDiffusionForTextToImageSynthesis, OfaForTextToImageSynthesis) from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES @@ -48,7 +51,9 @@ class TextToImageSynthesisPipeline(Pipeline): return input def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - if isinstance(self.model, OfaForTextToImageSynthesis): + if isinstance(self.model, + (OfaForTextToImageSynthesis, + MultiStageDiffusionForTextToImageSynthesis)): return self.model(input) return self.model.generate(input) diff --git a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py index bc697b05..3a9284f1 100644 --- a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py index b2442a3e..86177074 100644 --- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py @@ -56,6 +56,4 @@ class VisualQuestionAnsweringPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - if isinstance(self.model, OfaForAllTasks): - return inputs - return {OutputKeys.TEXT: inputs} + return inputs diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 665e016d..b5c53f82 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -5,54 +5,62 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline + from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline from .dialog_modeling_pipeline import DialogModelingPipeline from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline + from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .fill_mask_pipeline import FillMaskPipeline + from .fill_mask_ponet_pipeline import FillMaskPoNetPreprocessor from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline + from .summarization_pipeline import SummarizationPipeline + from .text_classification_pipeline import TextClassificationPipeline + from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline + from .token_classification_pipeline import TokenClassificationPipeline from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline - from .summarization_pipeline import SummarizationPipeline - from .text_classification_pipeline import TextClassificationPipeline - from .text_error_correction_pipeline import TextErrorCorrectionPipeline - from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline - from .relation_extraction_pipeline import RelationExtractionPipeline - + from .passage_ranking_pipeline import PassageRankingPipeline + from .sentence_embedding_pipeline import SentenceEmbeddingPipeline else: _import_structure = { 'conversational_text_to_sql_pipeline': ['ConversationalTextToSqlPipeline'], + 'table_question_answering_pipeline': + ['TableQuestionAnsweringPipeline'], 'dialog_intent_prediction_pipeline': ['DialogIntentPredictionPipeline'], 'dialog_modeling_pipeline': ['DialogModelingPipeline'], 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'document_segmentation_pipeline': ['DocumentSegmentationPipeline'], + 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], 'fill_mask_pipeline': 
['FillMaskPipeline'], + 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], + 'named_entity_recognition_pipeline': + ['NamedEntityRecognitionPipeline'], 'information_extraction_pipeline': ['InformationExtractionPipeline'], - 'single_sentence_classification_pipeline': - ['SingleSentenceClassificationPipeline'], 'pair_sentence_classification_pipeline': ['PairSentenceClassificationPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], + 'single_sentence_classification_pipeline': + ['SingleSentenceClassificationPipeline'], + 'summarization_pipeline': ['SummarizationPipeline'], + 'text_classification_pipeline': ['TextClassificationPipeline'], + 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], + 'token_classification_pipeline': ['TokenClassificationPipeline'], + 'translation_pipeline': ['TranslationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], - 'named_entity_recognition_pipeline': - ['NamedEntityRecognitionPipeline'], - 'translation_pipeline': ['TranslationPipeline'], - 'summarization_pipeline': ['SummarizationPipeline'], - 'text_classification_pipeline': ['TextClassificationPipeline'], - 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], - 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], - 'relation_extraction_pipeline': ['RelationExtractionPipeline'] + 'passage_ranking_pipeline': ['PassageRankingPipeline'], + 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'] } import sys diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index 399dad5a..c46e8c81 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -11,8 +11,8 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import ConversationalTextToSqlPreprocessor -from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor -from modelscope.preprocessors.star.fields.process_dataset import process_tables +from modelscope.preprocessors.star.fields import (SubPreprocessor, + process_tables) from modelscope.utils.constant import Tasks __all__ = ['ConversationalTextToSqlPipeline'] diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py new file mode 100644 index 00000000..202e6213 --- /dev/null +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -0,0 +1,107 @@ +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.nlp.plug import DistributedPlug +from modelscope.pipelines.base import DistributedPipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import TextGenerationPreprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_generation, module_name=Pipelines.plug_generation) +class DistributedPlugPipeline(DistributedPipeline): + """This class is used to instantiate the plug model. + """ + + model = None + + def __init__(self, + model, + preprocessor=None, + first_sequence='sentence', + **kwargs): + """Create a plug pipeline instance. 
+ + @param model: The model_id of plug(damo/nlp_plug_text-generation_27B). + The default path to damo/nlp_plug_text-generation_27B can be obtained by function + get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to + this path before calling this class by model_id. + The model can be downloaded from the link on + https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary. + After downloading, you should have a plug model structure like this: + /your/path/to/damo/nlp_plug_text-generation_27B + |_ config.json + |_ configuration.json + |_ ds_zero-offload_10B_config.json + |_ vocab.txt + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + @param preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will + be used as default. + @param first_sequence: The first_sequence key name if the input format is a dict. + @param kwargs: + sequence_length: The input sequence_length. + """ + if preprocessor is None: + preprocessor = TextGenerationPreprocessor( + model, + first_sequence=first_sequence, + sequence_length=kwargs.pop('sequence_length', 512)) + super().__init__(model, preprocessor=preprocessor, **kwargs) + assert hasattr(preprocessor, 'tokenizer') + self.cls_token_id = preprocessor.tokenizer.cls_token_id + + @classmethod + def _forward_one(cls, inputs: Dict[str, Any]) -> Dict[str, Any]: + with torch.no_grad(): + return cls.model.generate(inputs['inputs'], + **inputs['forward_params']) + + def _sanitize_parameters(self, **pipeline_parameters): + return {}, pipeline_parameters, {} + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + batch_size = inputs['input_ids'].shape[0] + dec_input_ids = torch.full([batch_size, 1], + self.cls_token_id, + dtype=torch.long) + inputs['dec_input_ids'] = dec_input_ids + res = super().forward(inputs, **forward_params) + return res + + @classmethod + def _instantiate_one(cls, rank, model_dir, **kwargs): + cls.model = DistributedPlug(model_dir, rank, **kwargs) + cls.model.eval() + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + from modelscope.outputs import OutputKeys + generate_context = inputs['generate_context'] + generate_context = ''.join( + self.preprocessor.tokenizer.convert_ids_to_tokens( + generate_context)).replace('[UNK]', '“').replace('##', '') + return {OutputKeys.TEXT: generate_context} diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index caba4122..db6b61c6 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -101,7 +101,7 @@ class FillMaskPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model(inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the 
prediction results diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py new file mode 100644 index 00000000..9770fc38 --- /dev/null +++ b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py @@ -0,0 +1,136 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import FillMaskPoNetPreprocessor, Preprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['FillMaskPonetPipeline'] +_type_map = {'ponet': 'bert'} + + +@PIPELINES.register_module( + Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) +class FillMaskPonetPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', + **kwargs): + """Use `model` and `preprocessor` to create an nlp fill mask pipeline for prediction + + Args: + model (str or Model): Supply either a local model dir which supports the fill-mask task, + or a fill-mask model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the sentence in. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline( + 'fill-mask', model='damo/nlp_ponet_fill-mask_english-base') + >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' + >>> print(pipeline_ins(input)) + + NOTE2: Please pay attention to the model's special tokens. + If a bert-based model (bert, structbert, etc.) is used, the mask token is '[MASK]'. + If an xlm-roberta-based model (xlm-roberta, veco, etc.) is used, the mask token is '<mask>'. + To view other examples please check the tests/pipelines/test_fill_mask.py. 
+ """ + fill_mask_model = model if isinstance( + model, Model) else Model.from_pretrained(model) + + self.config = Config.from_file( + os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) + + if preprocessor is None: + preprocessor = FillMaskPoNetPreprocessor( + fill_mask_model.model_dir, + first_sequence=first_sequence, + second_sequence=None, + sequence_length=kwargs.pop('sequence_length', 512)) + + fill_mask_model.eval() + super().__init__( + model=fill_mask_model, preprocessor=preprocessor, **kwargs) + + self.preprocessor = preprocessor + + self.tokenizer = preprocessor.tokenizer + self.mask_id = {'roberta': 250001, 'bert': 103} + + self.rep_map = { + 'bert': { + '[unused0]': '', + '[PAD]': '', + '[unused1]': '', + r' +': ' ', + '[SEP]': '', + '[unused2]': '', + '[CLS]': '', + '[UNK]': '' + }, + 'roberta': { + r' +': ' ', + '': '', + '': '', + '': '', + '': '', + '': ' ' + } + } + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + import numpy as np + logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() + input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() + pred_ids = np.argmax(logits, axis=-1) + model_type = self.model.config.model_type + process_type = model_type if model_type in self.mask_id else _type_map[ + model_type] + rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, + input_ids) + + def rep_tokens(string, rep_map): + for k, v in rep_map.items(): + string = string.replace(k, v) + return string.strip() + + pred_strings = [] + for ids in rst_ids: # batch + if 'language' in self.config.model and self.config.model.language == 'zh': + pred_string = self.tokenizer.convert_ids_to_tokens(ids) + pred_string = ''.join(pred_string) + else: + pred_string = self.tokenizer.decode(ids) + pred_string = rep_tokens(pred_string, self.rep_map[process_type]) + pred_strings.append(pred_string) + + return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/passage_ranking_pipeline.py new file mode 100644 index 00000000..c03e7b93 --- /dev/null +++ b/modelscope/pipelines/nlp/passage_ranking_pipeline.py @@ -0,0 +1,58 @@ +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import PassageRankingPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks + +__all__ = ['PassageRankingPipeline'] + + +@PIPELINES.register_module( + Tasks.passage_ranking, module_name=Pipelines.passage_ranking) +class PassageRankingPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """Use `model` and `preprocessor` to create a nlp word segment pipeline for prediction. + + Args: + model (str or Model): Supply either a local model dir which supported the WS task, + or a model id from the model hub, or a torch model instance. 
+ preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + """ + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + + if preprocessor is None: + preprocessor = PassageRankingPreprocessor( + model.model_dir if isinstance(model, Model) else model, + sequence_length=kwargs.pop('sequence_length', 128)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return {**self.model(inputs, **forward_params)} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """process the prediction results + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, Any]: the predicted text representation + """ + pred_list = inputs[OutputKeys.SCORES] + + return {OutputKeys.SCORES: pred_list} diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py new file mode 100644 index 00000000..3ef6d06b --- /dev/null +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -0,0 +1,60 @@ +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import (Preprocessor, + SentenceEmbeddingPreprocessor) +from modelscope.utils.constant import Tasks + +__all__ = ['SentenceEmbeddingPipeline'] + + +@PIPELINES.register_module( + Tasks.sentence_embedding, module_name=Pipelines.sentence_embedding) +class SentenceEmbeddingPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='first_sequence', + **kwargs): + """Use `model` and `preprocessor` to create a nlp text dual encoder then generates the text representation. + Args: + model (str or Model): Supply either a local model dir which supported the WS task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. 
+ """ + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = SentenceEmbeddingPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + sequence_length=kwargs.pop('sequence_length', 128)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return {**self.model(inputs, **forward_params)} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, Any]: the predicted text representation + """ + embs = inputs[OutputKeys.TEXT_EMBEDDING] + scores = inputs[OutputKeys.SCORES] + return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py index 25d68993..28bbc732 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -35,7 +35,7 @@ class SequenceClassificationPipelineBase(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model(inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py new file mode 100644 index 00000000..96bfbc34 --- /dev/null +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -0,0 +1,283 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +from typing import Any, Dict, Union + +from transformers import BertTokenizer + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.models.nlp import TableQuestionAnswering +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import TableQuestionAnsweringPreprocessor +from modelscope.preprocessors.star3.fields.database import Database +from modelscope.preprocessors.star3.fields.struct import Constant, SQLQuery +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['TableQuestionAnsweringPipeline'] + + +@PIPELINES.register_module( + Tasks.table_question_answering, + module_name=Pipelines.table_question_answering_pipeline) +class TableQuestionAnsweringPipeline(Pipeline): + + def __init__(self, + model: Union[TableQuestionAnswering, str], + preprocessor: TableQuestionAnsweringPreprocessor = None, + db: Database = None, + **kwargs): + """use `model` and `preprocessor` to create a table question answering prediction pipeline + + Args: + model (TableQuestionAnswering): a model instance + preprocessor (TableQuestionAnsweringPreprocessor): a preprocessor instance + db (Database): a database to store tables in the database + """ + model = model if isinstance( + model, TableQuestionAnswering) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = TableQuestionAnsweringPreprocessor(model.model_dir) + + # initilize tokenizer + self.tokenizer = BertTokenizer( + os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) + + # initialize database + if db is None: + self.db = Database( + tokenizer=self.tokenizer, + table_file_path=os.path.join(model.model_dir, 'table.json'), + syn_dict_file_path=os.path.join(model.model_dir, + 'synonym.txt')) + else: + self.db = db + + constant = Constant() + self.agg_ops = constant.agg_ops + self.cond_ops = constant.cond_ops + self.cond_conn_ops = constant.cond_conn_ops + self.action_ops = constant.action_ops + self.max_select_num = constant.max_select_num + self.max_where_num = constant.max_where_num + self.col_type_dict = constant.col_type_dict + self.schema_link_dict = constant.schema_link_dict + + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def post_process_multi_turn(self, history_sql, result, table): + action = self.action_ops[result['action']] + headers = table['header_name'] + current_sql = result['sql'] + + if history_sql is None: + return current_sql + + if action == 'out_of_scripts': + return history_sql + + elif action == 'switch_table': + return current_sql + + elif action == 'restart': + return current_sql + + elif action == 'firstTurn': + return current_sql + + elif action == 'del_focus': + pre_final_sql = history_sql + pre_sels = [] + pre_aggs = [] + for idx, seli in enumerate(pre_final_sql['sel']): + if seli not in current_sql['sel']: + pre_sels.append(seli) + pre_aggs.append(pre_final_sql['agg'][idx]) + + if len(pre_sels) < 1: + pre_sels.append(len(headers)) + pre_aggs.append(0) + pre_final_sql['sel'] = pre_sels + pre_final_sql['agg'] = pre_aggs + + final_conds = [] + for condi in pre_final_sql['conds']: + if condi[0] < len(headers): + final_conds.append(condi) + if len(final_conds) < 1: + final_conds.append([len(headers), 2, 'Null']) + pre_final_sql['conds'] = final_conds + + return pre_final_sql + + elif action == 'change_agg_only': + pre_final_sql = history_sql + pre_sels = [] + pre_aggs = [] + for idx, seli in 
enumerate(pre_final_sql['sel']): + if seli in current_sql['sel']: + pre_sels.append(seli) + changed_aggi = -1 + for idx_single, aggi in enumerate(current_sql['agg']): + if current_sql['sel'][idx_single] == seli: + changed_aggi = aggi + pre_aggs.append(changed_aggi) + else: + pre_sels.append(seli) + pre_aggs.append(pre_final_sql['agg'][idx]) + pre_final_sql['sel'] = pre_sels + pre_final_sql['agg'] = pre_aggs + + return pre_final_sql + + elif action == 'change_focus_total': + pre_final_sql = history_sql + pre_sels = current_sql['sel'] + pre_aggs = current_sql['agg'] + + pre_final_sql['sel'] = pre_sels + pre_final_sql['agg'] = pre_aggs + for pre_condi in current_sql['conds']: + if pre_condi[0] < len(headers): + in_flag = False + for history_condi in history_sql['conds']: + if pre_condi[0] == history_condi[0]: + in_flag = True + if not in_flag: + pre_final_sql['conds'].append(pre_condi) + + return pre_final_sql + + elif action == 'del_cond': + pre_final_sql = history_sql + + final_conds = [] + + for idx, condi in enumerate(pre_final_sql['conds']): + if condi[0] not in current_sql['sel']: + final_conds.append(condi) + pre_final_sql['conds'] = final_conds + + final_conds = [] + for condi in pre_final_sql['conds']: + if condi[0] < len(headers): + final_conds.append(condi) + if len(final_conds) < 1: + final_conds.append([len(headers), 2, 'Null']) + pre_final_sql['conds'] = final_conds + + return pre_final_sql + + elif action == 'change_cond': + pre_final_sql = history_sql + final_conds = [] + + for idx, condi in enumerate(pre_final_sql['conds']): + in_single_flag = False + for single_condi in current_sql['conds']: + if condi[0] == single_condi[0]: + in_single_flag = True + final_conds.append(single_condi) + if not in_single_flag: + final_conds.append(condi) + pre_final_sql['conds'] = final_conds + + final_conds = [] + for condi in pre_final_sql['conds']: + if condi[0] < len(headers): + final_conds.append(condi) + if len(final_conds) < 1: + final_conds.append([len(headers), 2, 'Null', 'Null']) + pre_final_sql['conds'] = final_conds + + return pre_final_sql + + elif action == 'add_cond': + pre_final_sql = history_sql + final_conds = pre_final_sql['conds'] + for idx, condi in enumerate(current_sql['conds']): + if condi[0] < len(headers): + final_conds.append(condi) + pre_final_sql['conds'] = final_conds + + final_conds = [] + for condi in pre_final_sql['conds']: + if condi[0] < len(headers): + final_conds.append(condi) + if len(final_conds) < 1: + final_conds.append([len(headers), 2, 'Null']) + pre_final_sql['conds'] = final_conds + + return pre_final_sql + + else: + return current_sql + + def sql_dict_to_str(self, result, table): + """ + convert sql struct to string + """ + header_names = table['header_name'] + ['空列'] + header_ids = table['header_id'] + ['null'] + sql = result['sql'] + + str_sel_list, sql_sel_list = [], [] + for idx, sel in enumerate(sql['sel']): + header_name = header_names[sel] + header_id = '`%s`.`%s`' % (table['table_id'], header_ids[sel]) + if sql['agg'][idx] == 0: + str_sel_list.append(header_name) + sql_sel_list.append(header_id) + else: + str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' + + header_name + ' )') + sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' + + header_id + ' )') + + str_cond_list, sql_cond_list = [], [] + for cond in sql['conds']: + header_name = header_names[cond[0]] + header_id = '`%s`.`%s`' % (table['table_id'], header_ids[cond[0]]) + op = self.cond_ops[cond[1]] + value = cond[2] + str_cond_list.append('( ' + header_name + ' ' + op + ' 
"' + value + + '" )') + sql_cond_list.append('( ' + header_id + ' ' + op + ' "' + value + + '" )') + + cond = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' ' + + final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join(str_sel_list), + table['table_name'], + cond.join(str_cond_list)) + final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join(sql_sel_list), + table['table_id'], + cond.join(sql_cond_list)) + sql = SQLQuery( + string=final_str, query=final_sql, sql_result=result['sql']) + + return sql + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + result = inputs['result'] + history_sql = inputs['history_sql'] + result['sql'] = self.post_process_multi_turn( + history_sql=history_sql, + result=result, + table=self.db.tables[result['table_id']]) + sql = self.sql_dict_to_str( + result=result, table=self.db.tables[result['table_id']]) + output = {OutputKeys.OUTPUT: sql, OutputKeys.HISTORY: result['sql']} + return output + + def _collate_fn(self, data): + return data diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py new file mode 100644 index 00000000..804f8146 --- /dev/null +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -0,0 +1,92 @@ +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) +from modelscope.utils.constant import Tasks + +__all__ = ['TokenClassificationPipeline'] + + +@PIPELINES.register_module( + Tasks.token_classification, module_name=Pipelines.part_of_speech) +class TokenClassificationPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """use `model` and `preprocessor` to create a token classification pipeline for prediction + + Args: + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor): a preprocessor instance, must not be None. + """ + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = TokenClassificationPreprocessor( + model.model_dir, + sequence_length=kwargs.pop('sequence_length', 128)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = getattr(model, 'id2label') + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' 
+ + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) + with torch.no_grad(): + return { + **self.model(inputs, **forward_params), OutputKeys.TEXT: text + } + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + pred_list = inputs['predictions'] + labels = [] + for pre in pred_list: + labels.append(self.id2label[pre]) + labels = labels[1:-1] + chunks = [] + tags = [] + chunk = '' + assert len(inputs['text']) == len(labels) + for token, label in zip(inputs['text'], labels): + if label[0] == 'B' or label[0] == 'I': + chunk += token + else: + chunk += token + chunks.append(chunk) + chunk = '' + tags.append(label.split('-')[-1]) + if chunk: + chunks.append(chunk) + tags.append(label.split('-')[-1]) + pos_result = [] + seg_result = ' '.join(chunks) + for chunk, tag in zip(chunks, tags): + pos_result.append({OutputKeys.WORD: chunk, OutputKeys.LABEL: tag}) + outputs = { + OutputKeys.OUTPUT: seg_result, + OutputKeys.LABELS: pos_result + } + return outputs diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 66a5c524..7e8b22bc 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -62,7 +62,7 @@ class WordSegmentationPipeline(Pipeline): text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): return { - **self.model(inputs, **forward_params), OutputKeys.TEXT: text + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text } def postprocess(self, inputs: Dict[str, Any], @@ -94,4 +94,4 @@ class WordSegmentationPipeline(Pipeline): if chunk: chunks.append(chunk) seg_result = ' '.join(chunks) - return {OutputKeys.OUTPUT: seg_result} + return {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index e39cb0e1..38c0ee77 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -85,7 +85,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model(inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9f7d595e..ba03a35e 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -15,21 +15,23 @@ if TYPE_CHECKING: ImageDenoisePreprocessor) from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) - from .nlp import (Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, - TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - PairSentenceClassificationPreprocessor, - FillMaskPreprocessor, ZeroShotClassificationPreprocessor, - NERPreprocessor, TextErrorCorrectionPreprocessor, - FaqQuestionAnsweringPreprocessor, - RelationExtractionPreprocessor) - from .slp import DocumentSegmentationPreprocessor + from .nlp import ( + Tokenize, SequenceClassificationPreprocessor, + TextGenerationPreprocessor, 
TokenClassificationPreprocessor, + SingleSentenceClassificationPreprocessor, + PairSentenceClassificationPreprocessor, FillMaskPreprocessor, + ZeroShotClassificationPreprocessor, NERPreprocessor, + TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor, + SequenceLabelingPreprocessor, RelationExtractionPreprocessor, + DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, + PassageRankingPreprocessor, + WordSegmentationBlankSetToLabelPreprocessor) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor from .star import ConversationalTextToSqlPreprocessor + from .star3 import TableQuestionAnsweringPreprocessor else: _import_structure = { @@ -51,16 +53,19 @@ else: 'SingleSentenceClassificationPreprocessor', 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', + 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', 'TextErrorCorrectionPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'RelationExtractionPreprocessor' + 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'RelationExtractionPreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor', + 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' ], - 'slp': ['DocumentSegmentationPreprocessor'], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', 'DialogStateTrackingPreprocessor', 'InputFeatures' ], 'star': ['ConversationalTextToSqlPreprocessor'], + 'star3': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py index 10057034..1e659218 100644 --- a/modelscope/preprocessors/audio.py +++ b/modelscope/preprocessors/audio.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import io import os from typing import Any, Dict, Tuple, Union @@ -6,9 +8,10 @@ import numpy as np import scipy.io.wavfile as wav import torch +from modelscope.fileio import File +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields -from . 
import Preprocessor -from .builder import PREPROCESSORS def load_kaldi_feature_transform(filename): @@ -201,7 +204,8 @@ class LinearAECAndFbank(Preprocessor): if isinstance(inputs, bytes): inputs = io.BytesIO(inputs) elif isinstance(inputs, str): - pass + file_bytes = File.read(inputs) + inputs = io.BytesIO(file_bytes) else: raise TypeError(f'Unsupported input type: {type(inputs)}.') sample_rate, data = wav.read(inputs) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 2416ea86..6cacb235 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -9,7 +9,7 @@ from PIL import Image from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Preprocessors from modelscope.pipelines.base import Input -from modelscope.preprocessors.image import load_image +from modelscope.preprocessors import load_image from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks from .base import Preprocessor diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py new file mode 100644 index 00000000..eee5e80f --- /dev/null +++ b/modelscope/preprocessors/nlp/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .text_error_correction import TextErrorCorrectionPreprocessor + from .nlp_base import ( + Tokenize, SequenceClassificationPreprocessor, + TextGenerationPreprocessor, TokenClassificationPreprocessor, + SingleSentenceClassificationPreprocessor, + PairSentenceClassificationPreprocessor, FillMaskPreprocessor, + ZeroShotClassificationPreprocessor, NERPreprocessor, + FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, + RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, + FillMaskPoNetPreprocessor, PassageRankingPreprocessor, + WordSegmentationBlankSetToLabelPreprocessor) + +else: + _import_structure = { + 'nlp_base': [ + 'Tokenize', 'SequenceClassificationPreprocessor', + 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', + 'SingleSentenceClassificationPreprocessor', + 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', + 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', + 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', + 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'RelationExtractionPreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor', + 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + ], + 'text_error_correction': [ + 'TextErrorCorrectionPreprocessor', + ], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp/nlp_base.py similarity index 58% rename from modelscope/preprocessors/nlp.py rename to modelscope/preprocessors/nlp/nlp_base.py index 825611d6..0a2495af 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
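# Import sketch for the reorganized preprocessor package introduced above; both paths are
# assumptions read off the export lists in this diff rather than documented API:
#
#   # resolved lazily via modelscope/preprocessors/__init__.py
#   from modelscope.preprocessors import TokenClassificationPreprocessor
#
#   # resolved via the new modelscope/preprocessors/nlp/__init__.py
#   from modelscope.preprocessors.nlp import TextErrorCorrectionPreprocessor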
import os.path as osp +import re import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union @@ -8,13 +9,18 @@ import numpy as np from transformers import AutoTokenizer, BertTokenizerFast from modelscope.metainfo import Models, Preprocessors +from modelscope.models.nlp.structbert import SbertTokenizerFast from modelscope.outputs import OutputKeys -from modelscope.utils.config import ConfigFields -from modelscope.utils.constant import Fields, InputFields, ModeKeys +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger +from modelscope.utils.nlp import import_external_nltk_data from modelscope.utils.type_assert import type_assert -from .base import Preprocessor -from .builder import PREPROCESSORS + +logger = get_logger() __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', @@ -22,8 +28,10 @@ __all__ = [ 'PairSentenceClassificationPreprocessor', 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor', - 'RelationExtractionPreprocessor' + 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', + 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor', + 'FillMaskPoNetPreprocessor' ] @@ -92,6 +100,7 @@ class SequenceClassificationPreprocessor(Preprocessor): text_a = new_data[self.first_sequence] text_b = new_data.get(self.second_sequence, None) + feature = self.tokenizer( text_a, text_b, @@ -103,7 +112,6 @@ class SequenceClassificationPreprocessor(Preprocessor): rst['input_ids'].append(feature['input_ids']) rst['attention_mask'].append(feature['attention_mask']) rst['token_type_ids'].append(feature['token_type_ids']) - return rst @@ -164,7 +172,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor): """ model_type = get_model_type(model_dir) - if model_type in (Models.structbert, Models.gpt3, Models.palm): + if model_type in (Models.structbert, Models.gpt3, Models.palm, + Models.plug): from modelscope.models.nlp.structbert import SbertTokenizer return SbertTokenizer.from_pretrained(model_dir, use_fast=False) elif model_type == Models.veco: @@ -259,6 +268,62 @@ class NLPTokenizerPreprocessorBase(Preprocessor): output[OutputKeys.LABELS] = labels +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.passage_ranking) +class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in passage ranking model. 
+ """ + + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + super().__init__(model_dir, pair=True, mode=mode, *args, **kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'source_sentence') + self.second_sequence = kwargs.pop('second_sequence', + 'sentences_to_compare') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: + if isinstance(data, tuple): + sentence1, sentence2 = data + elif isinstance(data, dict): + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + if isinstance(sentence2, str): + sentence2 = [sentence2] + if isinstance(sentence1, str): + sentence1 = [sentence1] + sentence1 = sentence1 * len(sentence2) + + max_seq_length = self.sequence_length + feature = self.tokenizer( + sentence1, + sentence2, + padding='max_length', + truncation=True, + max_length=max_seq_length, + return_tensors='pt') + if 'labels' in data: + labels = data['labels'] + feature['labels'] = labels + if 'qid' in data: + qid = data['qid'] + feature['qid'] = qid + return feature + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.nli_tokenizer) @PREPROCESSORS.register_module( @@ -289,6 +354,51 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): super().__init__(model_dir, pair=False, mode=mode, **kwargs) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_embedding) +class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sentence embedding. 
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data Dict: + keys: "source_sentence" && "sentences_to_compare" + values: list of sentences + Example: + {"source_sentence": ["how long it take to get a master's degree"], + "sentences_to_compare": ["On average, students take about 18 to 24 months + to complete a master's degree.", + "On the other hand, some students prefer to go at a slower pace + and choose to take several years to complete their studies.", + "It can take anywhere from two semesters"]} + Returns: + Dict[str, Any]: the preprocessed data + """ + source_sentence = data['source_sentence'] + compare_sentences = data['sentences_to_compare'] + sentences = [] + sentences.append(source_sentence[0]) + for sent in compare_sentences: + sentences.append(sent) + + tokenized_inputs = self.tokenizer( + sentences, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + padding=True, + truncation=True) + return tokenized_inputs + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): @@ -627,15 +737,15 @@ class NERPreprocessor(Preprocessor): @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.re_tokenizer) -class RelationExtractionPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal RE task. + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) +class SequenceLabelingPreprocessor(Preprocessor): + """The tokenizer preprocessor used in normal NER task. NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. 
""" def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data + """preprocess the data via the vocab.txt from the `model_dir` path Args: model_dir (str): model path @@ -645,8 +755,18 @@ class RelationExtractionPreprocessor(Preprocessor): self.model_dir: str = model_dir self.sequence_length = kwargs.pop('sequence_length', 512) - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) + + if 'lstm' in model_dir or 'gcnn' in model_dir: + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, use_fast=False) + elif 'structbert' in model_dir: + self.tokenizer = SbertTokenizerFast.from_pretrained( + model_dir, use_fast=False) + else: + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=False) + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) @type_assert(object, str) def __call__(self, data: str) -> Dict[str, Any]: @@ -663,54 +783,109 @@ class RelationExtractionPreprocessor(Preprocessor): # preprocess the data for the model input text = data - output = self.tokenizer([text], return_tensors='pt') + if self.is_split_into_words: + input_ids = [] + label_mask = [] + offset_mapping = [] + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)] + + [(offset + 1, offset + 1)] + * (len(subtoken_ids) - 1)) + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + offset_mapping = offset_mapping[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + else: + encodings = self.tokenizer( + text, + add_special_tokens=True, + padding=True, + truncation=True, + max_length=self.sequence_length, + return_offsets_mapping=True) + input_ids = encodings['input_ids'] + attention_mask = encodings['attention_mask'] + word_ids = encodings.word_ids() + label_mask = [] + offset_mapping = [] + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = (offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] return { 'text': text, - 'input_ids': output['input_ids'], - 'attention_mask': output['attention_mask'], - 'offsets': output[0].offsets + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping } @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_error_correction) -class TextErrorCorrectionPreprocessor(Preprocessor): - """The preprocessor used in text correction task. + Fields.nlp, module_name=Preprocessors.re_tokenizer) +class RelationExtractionPreprocessor(Preprocessor): + """The tokenizer preprocessor used in normal RE task. + + NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. 
""" def __init__(self, model_dir: str, *args, **kwargs): - from fairseq.data import Dictionary - """preprocess the data via the vocab file from the `model_dir` path + """preprocess the data Args: model_dir (str): model path """ + super().__init__(*args, **kwargs) - self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) + self.model_dir: str = model_dir + self.sequence_length = kwargs.pop('sequence_length', 512) + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=True) + + @type_assert(object, str) def __call__(self, data: str) -> Dict[str, Any]: """process the raw input data Args: data (str): a sentence Example: - '随着中国经济突飞猛近,建造工业与日俱增' + 'you are so handsome.' + Returns: Dict[str, Any]: the preprocessed data - Example: - {'net_input': - {'src_tokens':tensor([1,2,3,4]), - 'src_lengths': tensor([4])} - } """ - text = ' '.join([x for x in data]) - inputs = self.vocab.encode_line( - text, append_eos=True, add_if_not_exist=False) - lengths = inputs.size() - sample = dict() - sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths} - return sample + # preprocess the data for the model input + text = data + output = self.tokenizer([text], return_tensors='pt') + return { + 'text': text, + 'input_ids': output['input_ids'], + 'attention_mask': output['attention_mask'], + 'offsets': output[0].offsets + } @PREPROCESSORS.register_module( @@ -794,3 +969,297 @@ class FaqQuestionAnsweringPreprocessor(Preprocessor): max_length = self.MAX_LEN return self.tokenizer.batch_encode_plus( sentence_list, padding=True, max_length=max_length) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.document_segmentation) +class DocumentSegmentationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, config, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, + use_fast=True, + ) + self.question_column_name = 'labels' + self.context_column_name = 'sentences' + self.example_id_column_name = 'example_id' + self.label_to_id = {'B-EOP': 0, 'O': 1} + self.target_specical_ids = set() + self.target_specical_ids.add(self.tokenizer.eos_token_id) + self.max_seq_length = config.max_position_embeddings + self.label_list = ['B-EOP', 'O'] + + def __call__(self, examples) -> Dict[str, Any]: + questions = examples[self.question_column_name] + contexts = examples[self.context_column_name] + example_ids = examples[self.example_id_column_name] + num_examples = len(questions) + + sentences = [] + for sentence_list in contexts: + sentence_list = [_ + '[EOS]' for _ in sentence_list] + sentences.append(sentence_list) + + try: + tokenized_examples = self.tokenizer( + sentences, + is_split_into_words=True, + add_special_tokens=False, + return_token_type_ids=True, + return_attention_mask=True, + ) + except Exception as e: + logger.error(e) + return {} + + segment_ids = [] + token_seq_labels = [] + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_labels = questions[example_index] + example_labels = [ + self.label_to_id[_] if _ in self.label_to_id else -100 + for _ in example_labels + ] + example_token_labels = [] + segment_id = [] + cur_seg_id = 1 + for token_index in range(len(example_input_ids)): + if example_input_ids[token_index] in self.target_specical_ids: + example_token_labels.append(example_labels[cur_seg_id - 1]) + segment_id.append(cur_seg_id) + 
cur_seg_id += 1 + else: + example_token_labels.append(-100) + segment_id.append(cur_seg_id) + + segment_ids.append(segment_id) + token_seq_labels.append(example_token_labels) + + tokenized_examples['segment_ids'] = segment_ids + tokenized_examples['token_seq_labels'] = token_seq_labels + + new_segment_ids = [] + new_token_seq_labels = [] + new_input_ids = [] + new_token_type_ids = [] + new_attention_mask = [] + new_example_ids = [] + new_sentences = [] + + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_token_type_ids = tokenized_examples['token_type_ids'][ + example_index] + example_attention_mask = tokenized_examples['attention_mask'][ + example_index] + example_segment_ids = tokenized_examples['segment_ids'][ + example_index] + example_token_seq_labels = tokenized_examples['token_seq_labels'][ + example_index] + example_sentences = contexts[example_index] + example_id = example_ids[example_index] + example_total_num_sentences = len(questions[example_index]) + example_total_num_tokens = len( + tokenized_examples['input_ids'][example_index]) + accumulate_length = [ + i for i, x in enumerate(tokenized_examples['input_ids'] + [example_index]) + if x == self.tokenizer.eos_token_id + ] + samples_boundary = [] + left_index = 0 + sent_left_index = 0 + sent_i = 0 + + # for sent_i, length in enumerate(accumulate_length): + while sent_i < len(accumulate_length): + length = accumulate_length[sent_i] + right_index = length + 1 + sent_right_index = sent_i + 1 + if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: + samples_boundary.append([left_index, right_index]) + + sample_input_ids = [ + self.tokenizer.cls_token_id + ] + example_input_ids[left_index:right_index] + sample_input_ids = sample_input_ids[:self.max_seq_length] + + sample_token_type_ids = [ + 0 + ] + example_token_type_ids[left_index:right_index] + sample_token_type_ids = sample_token_type_ids[:self. + max_seq_length] + + sample_attention_mask = [ + 1 + ] + example_attention_mask[left_index:right_index] + sample_attention_mask = sample_attention_mask[:self. + max_seq_length] + + sample_segment_ids = [ + 0 + ] + example_segment_ids[left_index:right_index] + sample_segment_ids = sample_segment_ids[:self. + max_seq_length] + + sample_token_seq_labels = [ + -100 + ] + example_token_seq_labels[left_index:right_index] + sample_token_seq_labels = sample_token_seq_labels[:self. 
+ max_seq_length] + + if sent_right_index - 1 == sent_left_index: + left_index = right_index + sample_input_ids[-1] = self.tokenizer.eos_token_id + sample_token_seq_labels[-1] = -100 + else: + left_index = accumulate_length[sent_i - 1] + 1 + if sample_token_seq_labels[-1] != -100: + sample_token_seq_labels[-1] = -100 + + if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index] + sent_left_index = sent_right_index + sent_i += 1 + else: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index - 1] + sent_left_index = sent_right_index - 1 + + if (len([_ for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences) - 1 and (len([ + _ + for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences): + tmp = [] + for w_i, w, l in zip( + sample_input_ids, + self.tokenizer.decode(sample_input_ids).split( + ' '), sample_token_seq_labels): + tmp.append((w_i, w, l)) + while len(sample_input_ids) < self.max_seq_length: + sample_input_ids.append(self.tokenizer.pad_token_id) + sample_token_type_ids.append(0) + sample_attention_mask.append(0) + sample_segment_ids.append(example_total_num_sentences + + 1) + sample_token_seq_labels.append(-100) + + new_input_ids.append(sample_input_ids) + new_token_type_ids.append(sample_token_type_ids) + new_attention_mask.append(sample_attention_mask) + new_segment_ids.append(sample_segment_ids) + new_token_seq_labels.append(sample_token_seq_labels) + new_example_ids.append(example_id) + new_sentences.append(sample_sentences) + else: + sent_i += 1 + continue + + output_samples = {} + + output_samples['input_ids'] = new_input_ids + output_samples['token_type_ids'] = new_token_type_ids + output_samples['attention_mask'] = new_attention_mask + + output_samples['segment_ids'] = new_segment_ids + output_samples['example_id'] = new_example_ids + output_samples['labels'] = new_token_seq_labels + output_samples['sentences'] = new_sentences + + return output_samples + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.fill_mask_ponet) +class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. 
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 512) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + self.language = self.cfg.model.get('language', 'en') + if self.language == 'en': + from nltk.tokenize import sent_tokenize + import_external_nltk_data( + osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') + elif self.language in ['zh', 'cn']: + + def sent_tokenize(para): + para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', + para) # noqa * + para = para.rstrip() + return [_ for _ in para.split('\n') if _] + else: + raise NotImplementedError + + self.sent_tokenize = sent_tokenize + self.max_length = kwargs['max_length'] + + def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + max_seq_length = self.max_length + + if text_b is None: + segment_ids = [] + seg_lens = list( + map( + len, + self.tokenizer( + self.sent_tokenize(text_a), + add_special_tokens=False, + truncation=True)['input_ids'])) + segment_id = [0] + sum( + [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) + segment_id = segment_id[:max_seq_length - 1] + segment_ids.append(segment_id + [segment_id[-1] + 1] + * (max_seq_length - len(segment_id))) + output['segment_ids'] = segment_ids + + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + + self.labels_to_id(labels, output) + return output diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py new file mode 100644 index 00000000..357a946f --- /dev/null +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -0,0 +1,50 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os.path as osp +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_error_correction) +class TextErrorCorrectionPreprocessor(Preprocessor): + """The preprocessor used in text correction task. 
+ """ + + def __init__(self, model_dir: str, *args, **kwargs): + from fairseq.data import Dictionary + """preprocess the data via the vocab file from the `model_dir` path + + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '随着中国经济突飞猛近,建造工业与日俱增' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + + text = ' '.join([x for x in data]) + inputs = self.vocab.encode_line( + text, append_eos=True, add_if_not_exist=False) + lengths = inputs.size() + sample = dict() + sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths} + return sample diff --git a/modelscope/preprocessors/slp.py b/modelscope/preprocessors/slp.py deleted file mode 100644 index d9c2d9b7..00000000 --- a/modelscope/preprocessors/slp.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict - -from transformers import BertTokenizerFast - -from modelscope.metainfo import Preprocessors -from modelscope.utils.constant import Fields -from modelscope.utils.hub import get_model_type, parse_label_mapping -from modelscope.utils.type_assert import type_assert -from .base import Preprocessor -from .builder import PREPROCESSORS - -__all__ = ['DocumentSegmentationPreprocessor'] - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.document_segmentation) -class DocumentSegmentationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, config, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, - use_fast=True, - ) - self.question_column_name = 'labels' - self.context_column_name = 'sentences' - self.example_id_column_name = 'example_id' - self.label_to_id = {'B-EOP': 0, 'O': 1} - self.target_specical_ids = set() - self.target_specical_ids.add(self.tokenizer.eos_token_id) - self.max_seq_length = config.max_position_embeddings - self.label_list = ['B-EOP', 'O'] - - def __call__(self, examples) -> Dict[str, Any]: - questions = examples[self.question_column_name] - contexts = examples[self.context_column_name] - example_ids = examples[self.example_id_column_name] - num_examples = len(questions) - - sentences = [] - for sentence_list in contexts: - sentence_list = [_ + '[EOS]' for _ in sentence_list] - sentences.append(sentence_list) - - try: - tokenized_examples = self.tokenizer( - sentences, - is_split_into_words=True, - add_special_tokens=False, - return_token_type_ids=True, - return_attention_mask=True, - ) - except Exception as e: - print(str(e)) - return {} - - segment_ids = [] - token_seq_labels = [] - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_labels = questions[example_index] - example_labels = [ - self.label_to_id[_] if _ in self.label_to_id else -100 - for _ in example_labels - ] - example_token_labels = [] - segment_id = [] - cur_seg_id = 1 - for token_index in range(len(example_input_ids)): - if example_input_ids[token_index] in self.target_specical_ids: - example_token_labels.append(example_labels[cur_seg_id - 1]) - segment_id.append(cur_seg_id) - cur_seg_id += 1 - else: - 
example_token_labels.append(-100) - segment_id.append(cur_seg_id) - - segment_ids.append(segment_id) - token_seq_labels.append(example_token_labels) - - tokenized_examples['segment_ids'] = segment_ids - tokenized_examples['token_seq_labels'] = token_seq_labels - - new_segment_ids = [] - new_token_seq_labels = [] - new_input_ids = [] - new_token_type_ids = [] - new_attention_mask = [] - new_example_ids = [] - new_sentences = [] - - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_token_type_ids = tokenized_examples['token_type_ids'][ - example_index] - example_attention_mask = tokenized_examples['attention_mask'][ - example_index] - example_segment_ids = tokenized_examples['segment_ids'][ - example_index] - example_token_seq_labels = tokenized_examples['token_seq_labels'][ - example_index] - example_sentences = contexts[example_index] - example_id = example_ids[example_index] - example_total_num_sentences = len(questions[example_index]) - example_total_num_tokens = len( - tokenized_examples['input_ids'][example_index]) - accumulate_length = [ - i for i, x in enumerate(tokenized_examples['input_ids'] - [example_index]) - if x == self.tokenizer.eos_token_id - ] - samples_boundary = [] - left_index = 0 - sent_left_index = 0 - sent_i = 0 - - # for sent_i, length in enumerate(accumulate_length): - while sent_i < len(accumulate_length): - length = accumulate_length[sent_i] - right_index = length + 1 - sent_right_index = sent_i + 1 - if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: - samples_boundary.append([left_index, right_index]) - - sample_input_ids = [ - self.tokenizer.cls_token_id - ] + example_input_ids[left_index:right_index] - sample_input_ids = sample_input_ids[:self.max_seq_length] - - sample_token_type_ids = [ - 0 - ] + example_token_type_ids[left_index:right_index] - sample_token_type_ids = sample_token_type_ids[:self. - max_seq_length] - - sample_attention_mask = [ - 1 - ] + example_attention_mask[left_index:right_index] - sample_attention_mask = sample_attention_mask[:self. - max_seq_length] - - sample_segment_ids = [ - 0 - ] + example_segment_ids[left_index:right_index] - sample_segment_ids = sample_segment_ids[:self. - max_seq_length] - - sample_token_seq_labels = [ - -100 - ] + example_token_seq_labels[left_index:right_index] - sample_token_seq_labels = sample_token_seq_labels[:self. 
- max_seq_length] - - if sent_right_index - 1 == sent_left_index: - left_index = right_index - sample_input_ids[-1] = self.tokenizer.eos_token_id - sample_token_seq_labels[-1] = -100 - else: - left_index = accumulate_length[sent_i - 1] + 1 - if sample_token_seq_labels[-1] != -100: - sample_token_seq_labels[-1] = -100 - - if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index] - sent_left_index = sent_right_index - sent_i += 1 - else: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index - 1] - sent_left_index = sent_right_index - 1 - - if (len([_ for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences) - 1 and (len([ - _ - for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences): - tmp = [] - for w_i, w, l in zip( - sample_input_ids, - self.tokenizer.decode(sample_input_ids).split( - ' '), sample_token_seq_labels): - tmp.append((w_i, w, l)) - while len(sample_input_ids) < self.max_seq_length: - sample_input_ids.append(self.tokenizer.pad_token_id) - sample_token_type_ids.append(0) - sample_attention_mask.append(0) - sample_segment_ids.append(example_total_num_sentences - + 1) - sample_token_seq_labels.append(-100) - - new_input_ids.append(sample_input_ids) - new_token_type_ids.append(sample_token_type_ids) - new_attention_mask.append(sample_attention_mask) - new_segment_ids.append(sample_segment_ids) - new_token_seq_labels.append(sample_token_seq_labels) - new_example_ids.append(example_id) - new_sentences.append(sample_sentences) - else: - sent_i += 1 - continue - - output_samples = {} - - output_samples['input_ids'] = new_input_ids - output_samples['token_type_ids'] = new_token_type_ids - output_samples['attention_mask'] = new_attention_mask - - output_samples['segment_ids'] = new_segment_ids - output_samples['example_id'] = new_example_ids - output_samples['labels'] = new_token_seq_labels - output_samples['sentences'] = new_sentences - - return output_samples diff --git a/modelscope/preprocessors/space/__init__.py b/modelscope/preprocessors/space/__init__.py index f216287b..b484dabe 100644 --- a/modelscope/preprocessors/space/__init__.py +++ b/modelscope/preprocessors/space/__init__.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: + from .data_loader import DataLoader from .dialog_intent_prediction_preprocessor import \ DialogIntentPredictionPreprocessor from .dialog_modeling_preprocessor import DialogModelingPreprocessor @@ -13,6 +14,7 @@ if TYPE_CHECKING: else: _import_structure = { + 'data_loader': ['DataLoader'], 'dialog_intent_prediction_preprocessor': ['DialogIntentPredictionPreprocessor'], 'dialog_modeling_preprocessor': ['DialogModelingPreprocessor'], diff --git a/modelscope/preprocessors/space/args.py b/modelscope/preprocessors/space/args.py new file mode 100644 index 00000000..d9e91e74 --- /dev/null +++ b/modelscope/preprocessors/space/args.py @@ -0,0 +1,66 @@ +""" +Parse argument. +""" + +import argparse + +import json + + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Unsupported value encountered.') + + +class HParams(dict): + """ Hyper-parameters class + + Store hyper-parameters in training / infer / ... scripts. 
+ """ + + def __getattr__(self, name): + if name in self.keys(): + return self[name] + for v in self.values(): + if isinstance(v, HParams): + if name in v: + return v[name] + raise AttributeError(f"'HParams' object has no attribute '{name}'") + + def __setattr__(self, name, value): + self[name] = value + + def save(self, filename): + with open(filename, 'w', encoding='utf-8') as fp: + json.dump(self, fp, ensure_ascii=False, indent=4, sort_keys=False) + + def load(self, filename): + with open(filename, 'r', encoding='utf-8') as fp: + params_dict = json.load(fp) + for k, v in params_dict.items(): + if isinstance(v, dict): + self[k].update(HParams(v)) + else: + self[k] = v + + +def parse_args(parser): + """ Parse hyper-parameters from cmdline. """ + parsed = parser.parse_args() + args = HParams() + optional_args = parser._action_groups[1] + for action in optional_args._group_actions[1:]: + arg_name = action.dest + args[arg_name] = getattr(parsed, arg_name) + for group in parser._action_groups[2:]: + group_args = HParams() + for action in group._group_actions: + arg_name = action.dest + group_args[arg_name] = getattr(parsed, arg_name) + if len(group_args) > 0: + args[group.title] = group_args + return args diff --git a/modelscope/preprocessors/space/batch.py b/modelscope/preprocessors/space/batch.py new file mode 100644 index 00000000..fe0ad0ec --- /dev/null +++ b/modelscope/preprocessors/space/batch.py @@ -0,0 +1,55 @@ +def batch(reader, batch_size, drop_last=False): + """ + This operator creates a batched reader which combines the data from the + input reader to batched data. + + Args: + reader(generator): the data reader to read from. + batch_size(int): size of each mini-batch. + drop_last(bool, optional): If set to True, the last batch is dropped when + the size of last batch is not equal to batch_size, if set to False, + it will not. Default: False. + Returns: + The batched reader. + + Return Type: + generator + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + def reader(): + for i in range(10): + yield i + batch_reader = fluid.io.batch(reader, batch_size=2) + + for data in batch_reader(): + print(data) + + # Output is + # [0, 1] + # [2, 3] + # [4, 5] + # [6, 7] + # [8, 9] + """ + + def batch_reader(): + r = reader() + b = [] + for instance in r: + b.append(instance) + if len(b) == batch_size: + yield b + b = [] + if drop_last is False and len(b) != 0: + yield b + + # Batch size check + batch_size = int(batch_size) + if batch_size <= 0: + raise ValueError('batch_size should be a positive integeral value, ' + 'but got batch_size={}'.format(batch_size)) + + return batch_reader diff --git a/modelscope/preprocessors/space/data_loader.py b/modelscope/preprocessors/space/data_loader.py new file mode 100644 index 00000000..bd04a79c --- /dev/null +++ b/modelscope/preprocessors/space/data_loader.py @@ -0,0 +1,112 @@ +""" +DataLoader class +""" + +import math +import os + +import numpy as np + +from modelscope.preprocessors.space.args import str2bool +from modelscope.preprocessors.space.batch import batch +from modelscope.preprocessors.space.lazy_dataset import LazyDataset +from modelscope.preprocessors.space.sampler import (RandomSampler, + SequentialSampler, + SortedSampler) + + +def get_data_loader(batch_size, reader, hparams, file, collate_fn, is_test): + assert os.path.exists(file), f"{file} doesn't exist" + dataset = LazyDataset(file, reader=reader) + data_loader = DataLoader( + dataset, + batch_size, + hparams.Trainer, + collate_fn=collate_fn, + is_test=is_test) + return data_loader + + +def get_sequential_data_loader(batch_size, reader, hparams, data_paths, + collate_fn, data_type): + data_loaders = [] + for data_path in data_paths: + file = os.path.join( + data_path, + f'{data_type}.{hparams.BPETextField.tokenizer_type}.jsonl') + data_loaders.append( + get_data_loader( + batch_size=batch_size, + reader=reader, + hparams=hparams, + file=file, + collate_fn=collate_fn, + is_test=(data_type != 'train'))) + data_loader = SequentialDataLoaderWrapper(data_loaders) + return data_loader + + +class DataLoader(object): + """ Implement of DataLoader. 
""" + + @classmethod + def add_cmdline_argument(cls, group): + group.add_argument('--shuffle', type=str2bool, default=True) + group.add_argument('--sort_pool_size', type=int, default=0) + return group + + def __init__(self, + dataset, + batch_size, + hparams, + collate_fn=None, + sampler=None, + is_test=False): + self.dataset = dataset + self.collate_fn = collate_fn + self.gpu = hparams.gpu + self.sort_pool_size = hparams.sort_pool_size + + if sampler is None: + if hparams.shuffle and not is_test: + sampler = RandomSampler(dataset) + else: + sampler = SequentialSampler(dataset) + + if self.sort_pool_size > 0 and not is_test: + sampler = SortedSampler(sampler, self.sort_pool_size) + + def reader(): + for idx in sampler: + yield idx + + drop_last = False if self.gpu <= 1 or is_test else True + self.reader = batch(reader, batch_size=batch_size, drop_last=drop_last) + self.num_batches = math.floor(len(dataset) / batch_size) if drop_last \ + else math.ceil(len(dataset) / batch_size) + + def __len__(self): + return self.num_batches + + def __iter__(self): + for batch_indices in self.reader(): + samples = [self.dataset[idx] for idx in batch_indices] + yield self.collate_fn(samples) + + +class SequentialDataLoaderWrapper: + + def __init__(self, data_loaders): + self.data_loaders = data_loaders + self.data_file_to_dataset = { + data_loader.dataset.data_file: data_loader.dataset + for data_loader in self.data_loaders + } + + def __iter__(self): + for data_loader in self.data_loaders: + for tmp_batch in data_loader: + yield data_loader.dataset.data_file, tmp_batch + + def __len__(self): + return np.sum([len(data_loader) for data_loader in self.data_loaders]) diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py index a2157c2b..c461ade1 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py @@ -35,7 +35,7 @@ class DialogModelingPreprocessor(Preprocessor): self.config.use_gpu = self.config.use_gpu and torch.cuda.is_available() self.text_field = MultiWOZBPETextField( - self.model_dir, config=self.config) + config=self.config, model_dir=self.model_dir) @type_assert(object, Dict) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py index 5bff360f..32346bd5 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/space/fields/gen_field.py @@ -2,9 +2,11 @@ import os import random +from asyncio import constants from collections import OrderedDict from itertools import chain +import json import numpy as np from modelscope.preprocessors.space.tokenizer import Tokenizer @@ -117,7 +119,8 @@ class BPETextField(object): return self.tokenizer.convert_tokens_to_ids([self.eos_d_token])[0] def __init__(self, config): - self.gpu = 0 + self.train, self.dev, self.test = [], [], [] + self.gpu = config.Trainer.gpu self.tokenizer = None self.vocab = None self.db = None @@ -249,13 +252,9 @@ class BPETextField(object): for dial in data: batch.append(dial) if len(batch) == self.batch_size: - # print('batch size: %d, batch num +1'%(len(batch))) all_batches.append(batch) batch = [] - # if remainder > 1/2 batch_size, just put them in the previous batch, otherwise form a new batch - # print('last batch size: %d, batch num +1'%(len(batch))) - # if (len(batch) % len(cfg.cuda_device)) != 0: - # batch 
= batch[:-(len(batch) % len(cfg.cuda_device))] + # TODO deal with deleted data if self.gpu <= 1: if len(batch) > 0.5 * self.batch_size: @@ -308,7 +307,7 @@ class BPETextField(object): class MultiWOZBPETextField(BPETextField): - def __init__(self, model_dir, config): + def __init__(self, config, **kwargs): super(MultiWOZBPETextField, self).__init__(config) import spacy @@ -327,8 +326,12 @@ class MultiWOZBPETextField(BPETextField): ) self.nlp = spacy.load('en_core_web_sm') + if config.do_train: + db_dir = kwargs['data_dir'] + else: + db_dir = kwargs['model_dir'] self.db = MultiWozDB( - model_dir, { + db_dir, { 'attraction': 'db/attraction_db_processed.json', 'hospital': 'db/hospital_db_processed.json', 'hotel': 'db/hotel_db_processed.json', @@ -337,14 +340,14 @@ class MultiWOZBPETextField(BPETextField): 'taxi': 'db/taxi_db_processed.json', 'train': 'db/train_db_processed.json', }) - self._build_vocab(model_dir) + self._build_vocab(db_dir) special_tokens = [ self.pad_token, self.bos_token, self.eos_token, self.unk_token ] special_tokens.extend(self.add_sepcial_tokens()) self.tokenizer = Tokenizer( - vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), + vocab_path=os.path.join(kwargs['model_dir'], ModelFile.VOCAB_FILE), special_tokens=special_tokens, tokenizer_type=config.BPETextField.tokenizer_type) self.understand_ids = self.tokenizer.convert_tokens_to_ids( @@ -352,6 +355,26 @@ class MultiWOZBPETextField(BPETextField): self.policy_ids = self.tokenizer.convert_tokens_to_ids( self.policy_tokens) + if config.do_train: + test_list = [ + line.strip().lower() for line in open( + os.path.join(kwargs['data_dir'], 'testListFile.json'), + 'r').readlines() + ] + dev_list = [ + line.strip().lower() for line in open( + os.path.join(kwargs['data_dir'], 'valListFile.json'), + 'r').readlines() + ] + + self.dev_files, self.test_files = {}, {} + for fn in test_list: + self.test_files[fn.replace('.json', '')] = 1 + for fn in dev_list: + self.dev_files[fn.replace('.json', '')] = 1 + + self._load_data(kwargs['data_dir']) + return def get_ids(self, data: str): @@ -414,7 +437,6 @@ class MultiWOZBPETextField(BPETextField): name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev} dial = name_to_set[set_name] turn_bucket = self._bucket_by_turn(dial) - # self._shuffle_turn_bucket(turn_bucket) all_batches = [] if set_name not in self.set_stats: @@ -433,19 +455,13 @@ class MultiWOZBPETextField(BPETextField): except Exception: log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % ( k, len(turn_bucket[k]), len(batches), 0.0) - # print("turn num:%d, dial num:v%d, batch num: %d, "%(k, len(turn_bucket[k]), len(batches))) + num_training_steps += k * len(batches) num_turns += k * len(turn_bucket[k]) num_dials += len(turn_bucket[k]) all_batches += batches log_str += 'total batch num: %d\n' % len(all_batches) - # print('total batch num: %d'%len(all_batches)) - # print('dialog count: %d'%dia_count) - # return all_batches - # log stats - # logging.info(log_str) - # cfg.num_training_steps = num_training_steps * cfg.epoch_num self.set_stats[set_name][ 'num_training_steps_per_epoch'] = num_training_steps # turn-level steps self.set_stats[set_name]['num_turns'] = num_turns @@ -484,6 +500,71 @@ class MultiWOZBPETextField(BPETextField): self.vocab.load_vocab(vp) return self.vocab.vocab_size + def _load_data(self, data_dir, save_temp=True): + """ + load processed data and encode, or load already encoded data + """ + + def load_data_from_resource(data_resource): + data = json.loads( + open( + 
os.path.join(data_dir, data_resource), + 'r', + encoding='utf-8').read().lower()) + train, dev, test = [], [], [] + for fn, dial in data.items(): + if '.json' in fn: + fn = fn.replace('.json', '') + if self.dev_files.get(fn): + dev.append(self._get_encoded_data(fn, dial)) + elif self.test_files.get(fn): + test.append(self._get_encoded_data(fn, dial)) + else: + train.append(self._get_encoded_data(fn, dial)) + return train, dev, test + + data_processed = 'new_db_se_blank_encoded_domain.data.json' + data_resource = 'data_for_damd.json' + if save_temp: # save encoded data + # encoded: no sos, se_encoded: sos and eos + encoded_file = os.path.join(data_dir, data_processed) + + if os.path.exists(encoded_file): + logger.info( + 'Reading encoded data from {}'.format(encoded_file)) + self.data = json.loads( + open( + os.path.join(data_dir, data_resource), + 'r', + encoding='utf-8').read().lower()) + encoded_data = json.loads( + open(encoded_file, 'r', encoding='utf-8').read()) + self.train = encoded_data['train'] + self.dev = encoded_data['dev'] + self.test = encoded_data['test'] + else: + logger.info( + 'Encoding data now and save the encoded data in {}'.format( + encoded_file)) + # not exists, encode data and save + self.train, self.dev, self.test = load_data_from_resource( + data_resource) + # save encoded data + encoded_data = { + 'train': self.train, + 'dev': self.dev, + 'test': self.test + } + json.dump(encoded_data, open(encoded_file, 'w'), indent=2) + else: # directly read processed data and encode + self.train, self.dev, self.test = load_data_from_resource( + data_resource) + + random.seed(10) + random.shuffle(self.train) + logger.info('train size:{}, dev size:{}, test size:{}'.format( + len(self.train), len(self.dev), len(self.test))) + def _get_convert_str(self, sent): assert isinstance(sent, str) return ' '.join([ @@ -491,14 +572,65 @@ class MultiWOZBPETextField(BPETextField): for tok in sent.split() ]) + def _get_encoded_data(self, fn, dial): + encoded_dial = [] + for idx, t in enumerate(dial['log']): # tokenize to list of ids + enc = {} + enc['dial_id'] = fn + + enc_info_list = [ + ('user', self.sos_u_id, 'user', self.eos_u_id), + ('usdx', self.sos_u_id, 'user', self.eos_u_id), + ('resp', self.sos_r_id, 'resp', self.eos_r_id), + ('bspn', self.sos_b_id, 'constraint', self.eos_b_id), + ('bsdx', self.sos_b_id, 'cons_delex', self.eos_b_id), + ('aspn', self.sos_a_id, 'sys_act', self.eos_a_id) + ] + for enc_key, start_token, item_key, end_token in enc_info_list: + enc[enc_key] = [ + start_token + ] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize( + self._get_convert_str(t[item_key]))) + [end_token] + + enc['turn_num'] = t['turn_num'] + + if idx > 0 and t['turn_domain'] == '[general]': + enc['dspn'] = encoded_dial[idx - 1]['dspn'] + enc['pointer'] = encoded_dial[idx - 1]['pointer'][:4] + [ + int(i) for i in t['pointer'].split(',') + ][-2:] + enc['turn_domain'] = encoded_dial[idx - 1]['turn_domain'] + enc['db'] = encoded_dial[idx - 1]['db'] + else: + if t['turn_domain'] == '[general]': + assert not t['constraint'], f'{fn}-{idx}' + enc['dspn'] = [ + self.sos_d_id + ] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize( + self._get_convert_str( + t['turn_domain']))) + [self.eos_d_id] + enc['pointer'] = [int(i) for i in t['pointer'].split(',')] + enc['turn_domain'] = t['turn_domain'].split() + db_pointer = self.bspan_to_DBpointer(t['constraint'], + t['turn_domain'].split()) + enc['db'] = [ + self.sos_db_id + ] + self.tokenizer.convert_tokens_to_ids( + 
self.tokenizer.tokenize( + self._get_convert_str(db_pointer))) + [self.eos_db_id] + + encoded_dial.append(enc) + return encoded_dial + def bspan_to_DBpointer(self, bspan, turn_domain): constraint_dict = self.bspan_to_constraint_dict(bspan) - # print(constraint_dict) matnums = self.db.get_match_num(constraint_dict) match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1] match_dom = match_dom[1:-1] if match_dom.startswith('[') else match_dom match = matnums[match_dom] - # vector = self.db.addDBPointer(match_dom, match) + vector = self.db.addDBIndicator(match_dom, match) return vector @@ -691,3 +823,67 @@ class MultiWOZBPETextField(BPETextField): inputs['labels'] = [context] # use previous turn return inputs, prompt_id + + def restore(self, resp, domain, constraint_dict, mat_ents): + restored = resp + + restored = restored.replace('[value_reference]', '53022') + restored = restored.replace('[value_car]', 'BMW') + + for d in domain: + constraint = constraint_dict.get(d, None) + if constraint: + replace_res_list = [('stay', '[value_stay]'), + ('day', '[value_day]'), + ('people', '[value_people]'), + ('time', '[value_time]'), + ('type', '[value_type]')] + for key, value_key in replace_res_list: + if key in constraint: + restored = restored.replace(value_key, constraint[key]) + + if d in mat_ents and len(mat_ents[d]) == 0: + for s in constraint: + if s == 'pricerange' and d in [ + 'hotel', 'restaurant' + ] and 'price]' in restored: + restored = restored.replace( + '[value_price]', constraint['pricerange']) + if s + ']' in restored: + restored = restored.replace( + '[value_%s]' % s, constraint[s]) + + if '[value_choice' in restored and mat_ents.get(d): + restored = restored.replace('[value_choice]', + str(len(mat_ents[d]))) + if '[value_choice' in restored: + restored = restored.replace('[value_choice]', '3') + + try: + ent = mat_ents.get(domain[-1], []) + if ent: + ent = ent[0] + + for t in restored.split(): + if '[value' in t: + slot = t[7:-1] + if ent.get(slot): + if domain[-1] == 'hotel' and slot == 'price': + slot = 'pricerange' + restored = restored.replace(t, ent[slot]) + elif slot == 'price': + if ent.get('pricerange'): + restored = restored.replace( + t, ent['pricerange']) + else: + logger.info(restored, domain) + except Exception: + logger.error(resp) + logger.error(restored) + quit() + + restored = restored.replace('[value_phone]', '62781111') + restored = restored.replace('[value_postcode]', 'CG9566') + restored = restored.replace('[value_address]', 'Parkside, Cambridge') + + return restored diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py index dc00e677..6d3b5fff 100644 --- a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/space/fields/intent_field.py @@ -791,7 +791,6 @@ class BPETextField(object): user_or_sys = [self.sos_r_id] tmp = [self.sos_u_id ] + self.numericalize(s) + user_or_sys - tmp = tmp + self.numericalize(s) + [self.eos_r_id] new_src.append(tmp) src_span_mask = [[0] + list(map(int, s)) + [0] diff --git a/modelscope/preprocessors/space/lazy_dataset.py b/modelscope/preprocessors/space/lazy_dataset.py new file mode 100644 index 00000000..8da21db7 --- /dev/null +++ b/modelscope/preprocessors/space/lazy_dataset.py @@ -0,0 +1,47 @@ +""" +Dataset class +""" + +import json + +from modelscope.preprocessors.space.args import str2bool + + +class LazyDataset(object): + """ + Lazy load dataset from disk. + + Each line of data file is a preprocessed example. 
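+
+    Offsets of all lines are collected once at construction time via fp.tell(),
+    so __getitem__ only has to seek to the requested offset and decode a single
+    line instead of reading the whole file into memory.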
+ """ + + def __init__(self, data_file, reader, transform=lambda s: json.loads(s)): + """ + Initialize lazy dataset. + + By default, loading .jsonl format. + + :param data_file + :type str + + :param transform + :type callable + """ + self.data_file = data_file + self.transform = transform + self.reader = reader + self.offsets = [0] + with open(data_file, 'r', encoding='utf-8') as fp: + while fp.readline() != '': + self.offsets.append(fp.tell()) + self.offsets.pop() + self.fp = open(data_file, 'r', encoding='utf-8') + + def __len__(self): + return len(self.offsets) + + def __getitem__(self, idx): + self.fp.seek(self.offsets[idx], 0) + sample = self.transform(self.fp.readline().strip()) + if self.reader.with_mlm: + sample = self.reader.create_token_masked_lm_predictions(sample) + return sample diff --git a/modelscope/preprocessors/space/preprocess.py b/modelscope/preprocessors/space/preprocess.py new file mode 100644 index 00000000..bd8d64d1 --- /dev/null +++ b/modelscope/preprocessors/space/preprocess.py @@ -0,0 +1,48 @@ +""" +Preprocess script. +""" + +import glob +import os + +from modelscope.preprocessors.space.args import parse_args +from modelscope.preprocessors.space.fields.intent_field import \ + IntentBPETextField + +FILE_NAME = 'train.json' + + +def intent_preprocess(path, cfg): + + bpe = IntentBPETextField(path, cfg) + args = cfg.Dataset + build_examples_fn = bpe.build_examples_multi_turn if args.trigger_role == 'system' \ + else bpe.build_examples_single_turn + build_score_matrix_fn = bpe.build_score_matrix + build_score_matrix_multiprocessing_fn = bpe.build_score_matrix_multiprocessing + data_paths = list( + os.path.dirname(c) for c in sorted( + glob.glob(args.data_dir + '/**/' + FILE_NAME, recursive=True))) + data_paths = bpe.filter_data_path(data_paths=data_paths) + + for mode in ['train', 'valid', 'test']: + for data_path in data_paths: + input_file = os.path.join(data_path, f'{mode}.json') + output_file = os.path.join(data_path, + f'{mode}.{bpe.tokenizer_type}.jsonl') + output_score_file = os.path.join(data_path, f'{mode}.Score.npy') + if os.path.exists(input_file) and not os.path.exists(output_file): + examples = build_examples_fn(input_file, data_type=mode) + if examples: + bpe.save_examples(examples, output_file) + else: + continue + if os.path.exists(output_file) and not os.path.exists(output_score_file) and \ + not args.dynamic_score and 'AnPreDial' in data_path: + examples = bpe.load_examples(output_file) + if args.num_process >= 2: + score_matrix = build_score_matrix_multiprocessing_fn( + examples) + else: + score_matrix = build_score_matrix_fn(examples) + bpe.save_examples(score_matrix, output_score_file) diff --git a/modelscope/preprocessors/space/sampler.py b/modelscope/preprocessors/space/sampler.py new file mode 100644 index 00000000..49a216d1 --- /dev/null +++ b/modelscope/preprocessors/space/sampler.py @@ -0,0 +1,75 @@ +""" +Sampler class. 
+""" + +import numpy as np + + +class Sampler(object): + + def __init__(self): + return + + def __len__(self): + raise NotImplementedError + + def __iter__(self): + raise NotImplementedError + + +class SequentialSampler(Sampler): + + def __init__(self, dataset): + self.dataset = dataset + return + + def __len__(self): + return len(self.dataset) + + def __iter__(self): + return iter(range(len(self))) + + +class RandomSampler(Sampler): + + def __init__(self, dataset): + self.dataset = dataset + self.epoch = 0 + return + + def __len__(self): + return len(self.dataset) + + def __iter__(self): + np.random.seed(self.epoch) + self.epoch += 1 + return iter(np.random.permutation(len(self))) + + +class SortedSampler(Sampler): + """ Sorted Sampler. + Sort each block of examples by key. + """ + + def __init__(self, sampler, sort_pool_size, key='src'): + self.sampler = sampler + self.sort_pool_size = sort_pool_size + self.key = lambda idx: len(self.sampler.dataset[idx][key]) + return + + def __len__(self): + return len(self.sampler) + + def __iter__(self): + pool = [] + for idx in self.sampler: + pool.append(idx) + if len(pool) == self.sort_pool_size: + pool = sorted(pool, key=self.key) + for i in pool: + yield i + pool = [] + if len(pool) > 0: + pool = sorted(pool, key=self.key) + for i in pool: + yield i diff --git a/modelscope/preprocessors/star/__init__.py b/modelscope/preprocessors/star/__init__.py index 5a4bcea9..cef8f074 100644 --- a/modelscope/preprocessors/star/__init__.py +++ b/modelscope/preprocessors/star/__init__.py @@ -6,7 +6,8 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .conversational_text_to_sql_preprocessor import \ ConversationalTextToSqlPreprocessor - from .fields import MultiWOZBPETextField, IntentBPETextField + from .fields import (get_label, SubPreprocessor, preprocess_dataset, + process_dataset) else: _import_structure = { diff --git a/modelscope/preprocessors/star/fields/__init__.py b/modelscope/preprocessors/star/fields/__init__.py index 1e95a998..7049c43b 100644 --- a/modelscope/preprocessors/star/fields/__init__.py +++ b/modelscope/preprocessors/star/fields/__init__.py @@ -1,6 +1,30 @@ -from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor -from modelscope.preprocessors.star.fields.parse import get_label -from modelscope.preprocessors.star.fields.preprocess_dataset import \ - preprocess_dataset -from modelscope.preprocessors.star.fields.process_dataset import \ - process_dataset +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .common_utils import SubPreprocessor + from .parse import get_label + from .preprocess_dataset import \ + preprocess_dataset + from .process_dataset import \ + process_dataset, process_tables + +else: + _import_structure = { + 'common_utils': ['SubPreprocessor'], + 'parse': ['get_label'], + 'preprocess_dataset': ['preprocess_dataset'], + 'process_dataset': ['process_dataset', 'process_tables'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/star3/__init__.py b/modelscope/preprocessors/star3/__init__.py new file mode 100644 index 00000000..9aa562d7 --- /dev/null +++ b/modelscope/preprocessors/star3/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .table_question_answering_preprocessor import TableQuestionAnsweringPreprocessor + from .fields import MultiWOZBPETextField, IntentBPETextField + +else: + _import_structure = { + 'table_question_answering_preprocessor': + ['TableQuestionAnsweringPreprocessor'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/star3/fields/__init__.py b/modelscope/preprocessors/star3/fields/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py new file mode 100644 index 00000000..a99800cf --- /dev/null +++ b/modelscope/preprocessors/star3/fields/database.py @@ -0,0 +1,77 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import json +import tqdm + +from modelscope.preprocessors.star3.fields.struct import Trie + + +class Database: + + def __init__(self, tokenizer, table_file_path, syn_dict_file_path): + self.tokenizer = tokenizer + self.tables = self.init_tables(table_file_path=table_file_path) + self.syn_dict = self.init_syn_dict( + syn_dict_file_path=syn_dict_file_path) + + def init_tables(self, table_file_path): + tables = {} + lines = [] + with open(table_file_path, 'r') as fo: + for line in fo: + lines.append(line) + + for line in tqdm.tqdm(lines, desc='Load Tables'): + table = json.loads(line.strip()) + + table_header_length = 0 + headers_tokens = [] + for header in table['header_name']: + header_tokens = self.tokenizer.tokenize(header) + table_header_length += len(header_tokens) + headers_tokens.append(header_tokens) + empty_column = self.tokenizer.tokenize('空列') + table_header_length += len(empty_column) + headers_tokens.append(empty_column) + table['tablelen'] = table_header_length + table['header_tok'] = headers_tokens + + table['header_types'].append('null') + table['header_units'] = [ + self.tokenizer.tokenize(unit) for unit in table['header_units'] + ] + [[]] + + trie_set = [Trie() for _ in table['header_name']] + for row in table['rows']: + for ii, cell in enumerate(row): + if 'real' in table['header_types'][ii].lower() or \ + 'number' in table['header_types'][ii].lower() or \ + 'duration' in table['header_types'][ii].lower(): + continue + word = str(cell).strip().lower() + trie_set[ii].insert(word, word) + + table['value_trie'] = trie_set + tables[table['table_id']] = table + + return tables + + def init_syn_dict(self, syn_dict_file_path): + lines = [] + with open(syn_dict_file_path, encoding='utf-8') as fo: + for line in fo: + lines.append(line) + + syn_dict = {} + for line in tqdm.tqdm(lines, desc='Load Synonym Dict'): + tokens = line.strip().split('\t') + if len(tokens) != 2: + continue + keys = tokens[0].strip().split('|') + values = tokens[1].strip().split('|') + for key in keys: + key = key.lower().strip() + syn_dict.setdefault(key, []) + for value in values: + syn_dict[key].append(value.lower().strip()) + + return syn_dict diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py new file mode 100644 index 00000000..40613f78 --- /dev/null +++ b/modelscope/preprocessors/star3/fields/schema_link.py @@ -0,0 +1,423 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
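+# Schema linking between a natural-language question and candidate tables:
+# column names (and their synonyms from the synonym dict) plus cell values
+# (via the per-column Trie built by Database) are fuzzily matched against the
+# question, and the matched spans are turned into question/header knowledge
+# tags, schema_link entries and a per-table match score.
+#
+# Rough usage, mirroring how TableQuestionAnsweringPreprocessor calls it:
+#
+#   linker = SchemaLinker()
+#   results = linker.get_entity_linking(
+#       tokenizer=tokenizer, nlu=question.lower(),
+#       nlu_t=tokenizer.tokenize(question.lower()),
+#       tables=db.tables, col_syn_dict=db.syn_dict)
+#   # results are sorted by match_score (best first); each entry holds
+#   # table_id, question_knowledge, header_knowledge and schema_link.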
+import re + +from modelscope.preprocessors.star3.fields.struct import TypeInfo + + +class SchemaLinker: + + def __init__(self): + pass + + def find_in_list(self, comlist, words): + result = False + for com in comlist: + if words in com: + result = True + break + return result + + def get_continue_score(self, pstr, tstr): + comlist = [] + minlen = min(len(pstr), len(tstr)) + for slen in range(minlen, 1, -1): + for ts in range(0, len(tstr), 1): + if ts + slen > len(tstr): + continue + words = tstr[ts:ts + slen] + if words in pstr and not self.find_in_list(comlist, words): + comlist.append(words) + + comlen = 0 + for com in comlist: + comlen += len(com) * len(com) + weight = comlen / (len(tstr) * len(tstr) + 0.001) + if weight > 1.0: + weight = 1.0 + + return weight + + def get_match_score(self, ptokens, ttokens): + pset = set(ptokens) + tset = set(ttokens) + comset = pset & tset + allset = pset | tset + weight2 = len(comset) / (len(allset) + 0.001) + weight3 = self.get_continue_score(''.join(ptokens), ''.join(ttokens)) + return 0.4 * weight2 + 0.6 * weight3 + + def is_number(self, s): + try: + float(s) + return True + except ValueError: + pass + + try: + import unicodedata + unicodedata.numeric(s) + return True + except (TypeError, ValueError): + pass + + return False + + def get_match_phrase(self, query, target): + if target in query: + return target, 1.0 + + qtokens = [] + for i in range(0, len(query), 1): + qtokens.append(query[i:i + 1]) + ttokens = [] + for i in range(0, len(target), 1): + ttokens.append(target[i:i + 1]) + ttok_set = set(ttokens) + + phrase = '' + score = 0.0 + for qidx, qword in enumerate(qtokens): + if qword not in ttok_set: + continue + + eidx = (qidx + 2 * len(ttokens)) if ( + len(qtokens) > qidx + 2 * len(ttokens)) else len(qtokens) + while eidx > qidx: + ptokens = qtokens[qidx:eidx] + weight = self.get_match_score(ptokens, ttokens) + if weight + 0.001 > score: + score = weight + phrase = ''.join(ptokens) + eidx -= 1 + + if self.is_number(target) and phrase != target: + score = 0.0 + if len(phrase) > 1 and phrase in target: + score *= (1.0 + 0.05 * len(phrase)) + + return phrase, score + + def allfindpairidx(self, que_tok, value_tok, weight): + idxs = [] + for i in range(0, len(que_tok) - len(value_tok) + 1, 1): + s = i + e = i + matched = True + for j in range(0, len(value_tok), 1): + if value_tok[j].lower() == que_tok[i + j].lower(): + e = i + j + else: + matched = False + break + if matched: + idxs.append([s, e, weight]) + + return idxs + + def findnear(self, ps1, pe1, ps2, pe2): + if abs(ps1 - pe2) <= 2 or abs(pe1 - ps2) <= 2: + return True + return False + + def get_column_type(self, col_idx, table): + colType = table['header_types'][col_idx] + if 'number' in colType or 'duration' in colType or 'real' in colType: + colType = 'real' + elif 'date' in colType: + colType = 'date' + elif 'bool' in colType: + colType = 'bool' + else: + colType = 'text' + + return colType + + def add_type_all(self, typeinfos, index, idxs, label, linktype, value, + orgvalue): + for idx in idxs: + info = TypeInfo(label, index, linktype, value, orgvalue, idx[0], + idx[1], idx[2]) + flag = True + for i, typeinfo in enumerate(typeinfos): + if info.pstart < typeinfo.pstart: + typeinfos.insert(i, info) + flag = False + break + + if flag: + typeinfos.append(info) + + return typeinfos + + def save_info(self, tinfo, sinfo): + flag = True + if tinfo.pstart > sinfo.pend or tinfo.pend < sinfo.pstart: + pass + elif tinfo.pstart >= sinfo.pstart and \ + tinfo.pend <= sinfo.pend and tinfo.index == 
-1: + flag = False + elif tinfo.pstart == sinfo.pstart and sinfo.pend == tinfo.pend and \ + abs(tinfo.weight - sinfo.weight) < 0.01: + pass + else: + if sinfo.label == 'col' or sinfo.label == 'val': + if tinfo.label == 'col' or tinfo.label == 'val': + if (sinfo.pend + - sinfo.pstart) > (tinfo.pend - tinfo.pstart) or ( + sinfo.weight > tinfo.weight + and sinfo.index != -1): + flag = False + else: + flag = False + else: + if (tinfo.label == 'op' or tinfo.label == 'agg'): + if (sinfo.pend - sinfo.pstart) > ( + tinfo.pend + - tinfo.pstart) or sinfo.weight > tinfo.weight: + flag = False + + return flag + + def normal_type_infos(self, infos): + typeinfos = [] + for info in infos: + typeinfos = [x for x in typeinfos if self.save_info(x, info)] + flag = True + for i, typeinfo in enumerate(typeinfos): + if not self.save_info(info, typeinfo): + flag = False + break + if info.pstart < typeinfo.pstart: + typeinfos.insert(i, info) + flag = False + break + if flag: + typeinfos.append(info) + return typeinfos + + def findnear_typeinfo(self, info1, info2): + return self.findnear(info1.pstart, info1.pend, info2.pstart, + info2.pend) + + def find_real_column(self, infos, table): + for i, vinfo in enumerate(infos): + if vinfo.index != -1 or vinfo.label != 'val': + continue + eoidx = -1 + for j, oinfo in enumerate(infos): + if oinfo.label != 'op': + continue + if self.findnear_typeinfo(vinfo, oinfo): + eoidx = j + break + for j, cinfo in enumerate(infos): + if cinfo.label != 'col' or table['header_types'][ + cinfo.index] != 'real': + continue + if self.findnear_typeinfo(cinfo, vinfo) or ( + eoidx != -1 + and self.findnear_typeinfo(cinfo, infos[eoidx])): + infos[i].index = cinfo.index + break + + return infos + + def filter_column_infos(self, infos): + delid = [] + for i, info in enumerate(infos): + if info.label != 'col': + continue + for j in range(i + 1, len(infos), 1): + if infos[j].label == 'col' and \ + info.pstart == infos[j].pstart and \ + info.pend == infos[j].pend: + delid.append(i) + delid.append(j) + break + + typeinfos = [] + for idx, info in enumerate(infos): + if idx in set(delid): + continue + typeinfos.append(info) + + return typeinfos + + def filter_type_infos(self, infos, table): + infos = self.filter_column_infos(infos) + infos = self.find_real_column(infos, table) + + colvalMp = {} + for info in infos: + if info.label == 'col': + colvalMp[info.index] = [] + for info in infos: + if info.label == 'val' and info.index in colvalMp: + colvalMp[info.index].append(info) + + delid = [] + for idx, info in enumerate(infos): + if info.label != 'val' or info.index in colvalMp: + continue + for index in colvalMp.keys(): + valinfos = colvalMp[index] + for valinfo in valinfos: + if valinfo.pstart <= info.pstart and \ + valinfo.pend >= info.pend: + delid.append(idx) + break + + typeinfos = [] + for idx, info in enumerate(infos): + if idx in set(delid): + continue + typeinfos.append(info) + + return typeinfos + + def get_table_match_score(self, nlu_t, schema_link): + match_len = 0 + for info in schema_link: + scale = 0.6 + if info['question_len'] > 0 and info['column_index'] != -1: + scale = 1.0 + else: + scale = 0.5 + match_len += scale * info['question_len'] * info['weight'] + + return match_len / (len(nlu_t) + 0.1) + + def get_entity_linking(self, tokenizer, nlu, nlu_t, tables, col_syn_dict): + """ + get linking between question and schema column + """ + typeinfos = [] + numbers = re.findall(r'[-]?\d*\.\d+|[-]?\d+|\d+', nlu) + + # search schema link in every table + search_result_list = [] + for 
tablename in tables: + table = tables[tablename] + trie_set = None + if 'value_trie' in table: + trie_set = table['value_trie'] + + typeinfos = [] + for ii, column in enumerate(table['header_name']): + column = column.lower() + column_new = re.sub('(.*?)', '', column) + column_new = re.sub('(.*?)', '', column_new) + cphrase, cscore = self.get_match_phrase( + nlu.lower(), column_new) + if cscore > 0.3 and cphrase.strip() != '': + phrase_tok = tokenizer.tokenize(cphrase) + cidxs = self.allfindpairidx(nlu_t, phrase_tok, cscore) + typeinfos = self.add_type_all(typeinfos, ii, cidxs, 'col', + 'column', cphrase, column) + if cscore < 0.8 and column_new in col_syn_dict: + columns = list(set(col_syn_dict[column_new])) + for syn_col in columns: + if syn_col not in nlu.lower() or syn_col == '': + continue + phrase_tok = tokenizer.tokenize(syn_col) + cidxs = self.allfindpairidx(nlu_t, phrase_tok, 1.0) + typeinfos = self.add_type_all(typeinfos, ii, cidxs, + 'col', 'column', syn_col, + column) + + for ii, trie in enumerate(trie_set): + ans = trie.match(nlu.lower()) + for cell in ans.keys(): + vphrase = cell + vscore = 1.0 + # print("trie_set find:", cell, ans[cell]) + phrase_tok = tokenizer.tokenize(vphrase) + if len(phrase_tok) == 0 or len(vphrase) < 2: + continue + vidxs = self.allfindpairidx(nlu_t, phrase_tok, vscore) + linktype = self.get_column_type(ii, table) + typeinfos = self.add_type_all(typeinfos, ii, vidxs, 'val', + linktype, vphrase, ans[cell]) + + for number in set(numbers): + number_tok = tokenizer.tokenize(number.lower()) + if len(number_tok) == 0: + continue + nidxs = self.allfindpairidx(nlu_t, number_tok, 1.0) + typeinfos = self.add_type_all(typeinfos, -1, nidxs, 'val', + 'real', number, number) + + newtypeinfos = self.normal_type_infos(typeinfos) + + newtypeinfos = self.filter_type_infos(newtypeinfos, table) + + final_question = [0] * len(nlu_t) + final_header = [0] * len(table['header_name']) + for typeinfo in newtypeinfos: + pstart = typeinfo.pstart + pend = typeinfo.pend + 1 + if typeinfo.label == 'op' or typeinfo.label == 'agg': + score = int(typeinfo.linktype[-1]) + if typeinfo.label == 'op': + score += 6 + else: + score += 11 + for i in range(pstart, pend, 1): + final_question[i] = score + + elif typeinfo.label == 'col': + for i in range(pstart, pend, 1): + final_question[i] = 4 + if final_header[typeinfo.index] % 2 == 0: + final_header[typeinfo.index] += 1 + + elif typeinfo.label == 'val': + if typeinfo.index == -1: + for i in range(pstart, pend, 1): + final_question[i] = 5 + else: + for i in range(pstart, pend, 1): + final_question[i] = 2 + final_question[pstart] = 1 + final_question[pend - 1] = 3 + if final_header[typeinfo.index] < 2: + final_header[typeinfo.index] += 2 + + # collect schema_link + schema_link = [] + for sl in newtypeinfos: + if sl.label in ['val', 'col']: + schema_link.append({ + 'question_len': + max(0, sl.pend - sl.pstart + 1), + 'question_index': [sl.pstart, sl.pend], + 'question_span': + ''.join(nlu_t[sl.pstart:sl.pend + 1]), + 'column_index': + sl.index, + 'column_span': + table['header_name'][sl.index] + if sl.index != -1 else '空列', + 'label': + sl.label, + 'weight': + round(sl.weight, 4) + }) + + # get the match score of each table + match_score = self.get_table_match_score(nlu_t, schema_link) + + search_result = { + 'table_id': table['table_id'], + 'question_knowledge': final_question, + 'header_knowledge': final_header, + 'schema_link': schema_link, + 'match_score': match_score + } + search_result_list.append(search_result) + + search_result_list = 
sorted( + search_result_list, key=lambda x: x['match_score'], + reverse=True)[0:4] + + return search_result_list diff --git a/modelscope/preprocessors/star3/fields/struct.py b/modelscope/preprocessors/star3/fields/struct.py new file mode 100644 index 00000000..3c2e664b --- /dev/null +++ b/modelscope/preprocessors/star3/fields/struct.py @@ -0,0 +1,181 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +cond_ops = ['>', '<', '==', '!=', 'ASC', 'DESC'] +agg_ops = [ + '', 'AVG', 'MAX', 'MIN', 'COUNT', 'SUM', 'COMPARE', 'GROUP BY', 'SAME' +] +conn_ops = ['', 'AND', 'OR'] + + +class Context: + + def __init__(self): + self.history_sql = None + + def set_history_sql(self, sql): + self.history_sql = sql + + +class SQLQuery: + + def __init__(self, string, query, sql_result): + self.string = string + self.query = query + self.sql_result = sql_result + + +class TrieNode(object): + + def __init__(self): + """ + Initialize your data structure here. + """ + self.data = {} + self.is_word = False + self.term = None + + +class Trie(object): + + def __init__(self): + self.root = TrieNode() + + def insert(self, word, term): + """ + Inserts a word into the trie. + :type word: str + :rtype: void + """ + node = self.root + for letter in word: + child = node.data.get(letter) + if not child: + node.data[letter] = TrieNode() + node = node.data[letter] + node.is_word = True + node.term = term + + def search(self, word): + """ + Returns if the word is in the trie. + :type word: str + :rtype: bool + """ + node = self.root + for letter in word: + node = node.data.get(letter) + if not node: + return None, False + return node.term, True + + def match(self, query): + start = 0 + end = 1 + length = len(query) + ans = {} + while start < length and end < length: + sub = query[start:end] + term, flag = self.search(sub) + if flag: + if term is not None: + ans[sub] = term + end += 1 + else: + start += 1 + end = start + 1 + return ans + + def starts_with(self, prefix): + """ + Returns if there is any word in the trie + that starts with the given prefix. 
+ :type prefix: str + :rtype: bool + """ + node = self.root + for letter in prefix: + node = node.data.get(letter) + if not node: + return False + return True + + def get_start(self, prefix): + """ + Returns words started with prefix + :param prefix: + :return: words (list) + """ + + def _get_key(pre, pre_node): + words_list = [] + if pre_node.is_word: + words_list.append(pre) + for x in pre_node.data.keys(): + words_list.extend(_get_key(pre + str(x), pre_node.data.get(x))) + return words_list + + words = [] + if not self.starts_with(prefix): + return words + if self.search(prefix): + words.append(prefix) + return words + node = self.root + for letter in prefix: + node = node.data.get(letter) + return _get_key(prefix, node) + + +class TypeInfo: + + def __init__(self, label, index, linktype, value, orgvalue, pstart, pend, + weight): + self.label = label + self.index = index + self.linktype = linktype + self.value = value + self.orgvalue = orgvalue + self.pstart = pstart + self.pend = pend + self.weight = weight + + +class Constant: + + def __init__(self): + self.action_ops = [ + 'add_cond', 'change_cond', 'del_cond', 'change_focus_total', + 'change_agg_only', 'del_focus', 'restart', 'switch_table', + 'out_of_scripts', 'repeat', 'firstTurn' + ] + + self.agg_ops = [ + '', 'AVG', 'MAX', 'MIN', 'COUNT', 'SUM', 'COMPARE', 'GROUP BY', + 'SAME' + ] + + self.cond_ops = ['>', '<', '==', '!=', 'ASC', 'DESC'] + + self.cond_conn_ops = ['', 'AND', 'OR'] + + self.col_type_dict = { + 'null': 0, + 'text': 1, + 'number': 2, + 'duration': 3, + 'bool': 4, + 'date': 5 + } + + self.schema_link_dict = { + 'col_start': 1, + 'col_middle': 2, + 'col_end': 3, + 'val_start': 4, + 'val_middle': 5, + 'val_end': 6 + } + + self.max_select_num = 4 + + self.max_where_num = 6 diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py new file mode 100644 index 00000000..163759a1 --- /dev/null +++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py @@ -0,0 +1,118 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
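+# Builds model inputs for table question answering: the incoming utterance
+# (data['question'], together with data['history_sql']) is lower-cased and
+# tokenized with the model's BertTokenizer, linked against the tables of a
+# Database through SchemaLinker.get_entity_linking, and the best-matching
+# table is packed into the 'datas' list returned to the model.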
+import os +from typing import Any, Dict + +import torch +from transformers import BertTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.preprocessors.star3.fields.database import Database +from modelscope.preprocessors.star3.fields.schema_link import SchemaLinker +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, ModelFile +from modelscope.utils.type_assert import type_assert + +__all__ = ['TableQuestionAnsweringPreprocessor'] + + +@PREPROCESSORS.register_module( + Fields.nlp, + module_name=Preprocessors.table_question_answering_preprocessor) +class TableQuestionAnsweringPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, db: Database = None, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + db (Database): database instance + """ + super().__init__(*args, **kwargs) + + self.model_dir: str = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + # read tokenizer + self.tokenizer = BertTokenizer( + os.path.join(self.model_dir, ModelFile.VOCAB_FILE)) + + # read database + if db is None: + self.db = Database( + tokenizer=self.tokenizer, + table_file_path=os.path.join(self.model_dir, 'table.json'), + syn_dict_file_path=os.path.join(self.model_dir, 'synonym.txt')) + else: + self.db = db + + # get schema linker + self.schema_linker = SchemaLinker() + + # set device + self.device = 'cuda' if \ + ('device' not in kwargs or kwargs['device'] == 'gpu') \ + and torch.cuda.is_available() else 'cpu' + + def construct_data(self, search_result_list, nlu, nlu_t, db, history_sql): + datas = [] + for search_result in search_result_list: + data = {} + data['table_id'] = search_result['table_id'] + data['question'] = nlu + data['question_tok'] = nlu_t + data['header_tok'] = db.tables[data['table_id']]['header_tok'] + data['types'] = db.tables[data['table_id']]['header_types'] + data['units'] = db.tables[data['table_id']]['header_units'] + data['action'] = 0 + data['sql'] = None + data['history_sql'] = history_sql + data['wvi_corenlp'] = [] + data['bertindex_knowledge'] = search_result['question_knowledge'] + data['header_knowledge'] = search_result['header_knowledge'] + data['schema_link'] = search_result['schema_link'] + datas.append(data) + + return datas + + @type_assert(object, dict) + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (dict): + utterance: a sentence + last_sql: predicted sql of last utterance + Example: + utterance: 'Which of these are hiring?' 
+ last_sql: '' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # tokenize question + question = data['question'] + history_sql = data['history_sql'] + nlu = question.lower() + nlu_t = self.tokenizer.tokenize(nlu) + + # get linking + search_result_list = self.schema_linker.get_entity_linking( + tokenizer=self.tokenizer, + nlu=nlu, + nlu_t=nlu_t, + tables=self.db.tables, + col_syn_dict=self.db.syn_dict) + + # collect data + datas = self.construct_data( + search_result_list=search_result_list[0:1], + nlu=nlu, + nlu_t=nlu_t, + db=self.db, + history_sql=history_sql) + + return {'datas': datas} diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 8f8938c8..a632642a 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: ImagePortraitEnhancementTrainer, MovieSceneSegmentationTrainer) from .multi_modal import CLIPTrainer - from .nlp import SequenceClassificationTrainer + from .nlp import SequenceClassificationTrainer, PassageRankingTrainer from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer from .trainer import EpochBasedTrainer @@ -25,7 +25,7 @@ else: 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer' ], 'multi_modal': ['CLIPTrainer'], - 'nlp': ['SequenceClassificationTrainer'], + 'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'], 'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'], 'trainer': ['EpochBasedTrainer'] } diff --git a/modelscope/trainers/easycv/__init__.py b/modelscope/trainers/easycv/__init__.py index e69de29b..b1b8fc15 100644 --- a/modelscope/trainers/easycv/__init__.py +++ b/modelscope/trainers/easycv/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .utils import AddLrLogHook, EasyCVMetric +else: + _import_structure = {'utils': ['AddLrLogHook', 'EasyCVMetric']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/trainers/easycv/trainer.py b/modelscope/trainers/easycv/trainer.py index dee06a41..3c869495 100644 --- a/modelscope/trainers/easycv/trainer.py +++ b/modelscope/trainers/easycv/trainer.py @@ -27,7 +27,6 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer): """Epoch based Trainer for EasyCV. Args: - task: Task name. cfg_file(str): The config file of EasyCV. model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir or a model id. If model is None, build_model method will be called. 
@@ -51,7 +50,6 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer): def __init__( self, - task: str, cfg_file: Optional[str] = None, model: Optional[Union[TorchModel, nn.Module, str]] = None, arg_parse_fn: Optional[Callable] = None, @@ -64,7 +62,6 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer): model_revision: Optional[str] = DEFAULT_MODEL_REVISION, **kwargs): - self.task = task register_util.register_parallel() register_util.register_part_mmcv_hooks_to_ms() @@ -168,8 +165,3 @@ class EasyCVEpochBasedTrainer(EpochBasedTrainer): device_ids=[torch.cuda.current_device()]) return build_parallel(dp_cfg) - - def rebuild_config(self, cfg: Config): - cfg.task = self.task - - return cfg diff --git a/modelscope/trainers/easycv/utils/register_util.py b/modelscope/trainers/easycv/utils/register_util.py index f80eaace..04bf719b 100644 --- a/modelscope/trainers/easycv/utils/register_util.py +++ b/modelscope/trainers/easycv/utils/register_util.py @@ -4,16 +4,49 @@ import logging from modelscope.trainers.hooks import HOOKS from modelscope.trainers.parallel.builder import PARALLEL +from modelscope.utils.registry import default_group + + +class _RegisterManager: + + def __init__(self): + self.registries = {} + + def add(self, module, name, group_key=default_group): + if module.name not in self.registries: + self.registries[module.name] = {} + if group_key not in self.registries[module.name]: + self.registries[module.name][group_key] = [] + + self.registries[module.name][group_key].append(name) + + def exists(self, module, name, group_key=default_group): + if self.registries.get(module.name, None) is None: + return False + if self.registries[module.name].get(group_key, None) is None: + return False + if name in self.registries[module.name][group_key]: + return True + + return False + + +_dynamic_register = _RegisterManager() def register_parallel(): from mmcv.parallel import MMDistributedDataParallel, MMDataParallel - PARALLEL.register_module( - module_name='MMDistributedDataParallel', - module_cls=MMDistributedDataParallel) - PARALLEL.register_module( - module_name='MMDataParallel', module_cls=MMDataParallel) + mmddp = 'MMDistributedDataParallel' + mmdp = 'MMDataParallel' + + if not _dynamic_register.exists(PARALLEL, mmddp): + _dynamic_register.add(PARALLEL, mmddp) + PARALLEL.register_module( + module_name=mmddp, module_cls=MMDistributedDataParallel) + if not _dynamic_register.exists(PARALLEL, mmdp): + _dynamic_register.add(PARALLEL, mmdp) + PARALLEL.register_module(module_name=mmdp, module_cls=MMDataParallel) def register_hook_to_ms(hook_name, logger=None): @@ -24,6 +57,10 @@ def register_hook_to_ms(hook_name, logger=None): raise ValueError( f'Not found hook "{hook_name}" in EasyCV hook registries!') + if _dynamic_register.exists(HOOKS, hook_name): + return + _dynamic_register.add(HOOKS, hook_name) + obj = _EV_HOOKS._module_dict[hook_name] HOOKS.register_module(module_name=hook_name, module_cls=obj) @@ -41,18 +78,19 @@ def register_part_mmcv_hooks_to_ms(): from mmcv.runner.hooks import lr_updater from mmcv.runner.hooks import HOOKS as _MMCV_HOOKS from easycv.hooks import StepFixCosineAnnealingLrUpdaterHook, YOLOXLrUpdaterHook - from easycv.hooks.logger import PreLoggerHook mmcv_hooks_in_easycv = [('StepFixCosineAnnealingLrUpdaterHook', StepFixCosineAnnealingLrUpdaterHook), - ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook), - ('PreLoggerHook', PreLoggerHook)] + ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook)] members = inspect.getmembers(lr_updater) members.extend(mmcv_hooks_in_easycv) for name, obj in members: 
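+        # _dynamic_register remembers what has already been copied into the
+        # ModelScope HOOKS registry, so these register_* helpers can be called
+        # repeatedly (e.g. each time an EasyCV trainer is built) without
+        # re-registering the same hook.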
if name in _MMCV_HOOKS._module_dict: + if _dynamic_register.exists(HOOKS, name): + continue + _dynamic_register.add(HOOKS, name) HOOKS.register_module( module_name=name, module_cls=obj, diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index fcd8e982..220929b8 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -1,14 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import random -import json +import numpy as np +import torch from modelscope import __version__ from modelscope.metainfo import Hooks -from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint from modelscope.utils.constant import LogKeys, ModelFile from modelscope.utils.logger import get_logger -from modelscope.utils.torch_utils import is_master +from modelscope.utils.torch_utils import get_dist_info, is_master from .builder import HOOKS from .hook import Hook from .priority import Priority @@ -25,6 +27,7 @@ class CheckpointHook(Hook): save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir` save_last (bool): Whether to save the last checkpoint. Default: True. + checkpoint_file (str): The checkpoint file to be loaded. """ PRIORITY = Priority.LOW @@ -34,12 +37,16 @@ class CheckpointHook(Hook): by_epoch=True, save_optimizer=True, save_dir=None, - save_last=True): + save_last=True, + checkpoint_file=None): self.interval = interval self.by_epoch = by_epoch self.save_optimizer = save_optimizer self.save_dir = save_dir + self.checkpoint_file = checkpoint_file self.save_last = save_last + self.rng_state = None + self.need_load_rng_state = False def before_run(self, trainer): if not self.save_dir: @@ -56,6 +63,34 @@ class CheckpointHook(Hook): if is_master(): self.logger.info(f'Checkpoints will be saved to {self.save_dir}') + if self.checkpoint_file is not None and os.path.isfile( + self.checkpoint_file): + meta = self.load_checkpoint(self.checkpoint_file, trainer) + self.rng_state = meta.get('rng_state') + self.need_load_rng_state = True + + def before_train_epoch(self, trainer): + if self.need_load_rng_state: + if self.rng_state is not None: + random.setstate(self.rng_state['random']) + np.random.set_state(self.rng_state['numpy']) + torch.random.set_rng_state(self.rng_state['cpu']) + if torch.cuda.is_available(): + torch.cuda.random.set_rng_state_all(self.rng_state['cuda']) + self.need_load_rng_state = False + else: + self.logger.warn( + 'Random state cannot be found in checkpoint file, ' + 'this may cause a random data order or model initialization.' 
+ ) + + self.rng_state = { + 'random': random.getstate(), + 'numpy': np.random.get_state(), + 'cpu': torch.random.get_rng_state(), + 'cuda': torch.cuda.get_rng_state_all(), + } + def after_train_epoch(self, trainer): if not self.by_epoch: return @@ -66,6 +101,39 @@ class CheckpointHook(Hook): f'Saving checkpoint at {trainer.epoch + 1} epoch') self._save_checkpoint(trainer) + @classmethod + def load_checkpoint(cls, filename, trainer): + from modelscope.trainers.parallel.utils import is_parallel + if is_parallel(trainer.model): + model = trainer.model.module + else: + model = trainer.model + meta = load_checkpoint(filename, model, trainer.optimizer, + trainer.lr_scheduler) + trainer._epoch = meta.get('epoch', trainer._epoch) + trainer._iter = meta.get('iter', trainer._iter) + trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter) + + for i, hook in enumerate(trainer.hooks): + # hook: Hook + key = f'{hook.__class__}-{i}' + if key in meta and hasattr(hook, 'load_state_dict'): + hook.load_state_dict(meta[key]) + else: + trainer.logger.warn( + f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.' + ) + + version = meta.get('modelscope') + if version != __version__: + trainer.logger.warn( + f'The modelscope version of loaded checkpoint does not match the runtime version. ' + f'The saved version: {version}, runtime version: {__version__}' + ) + trainer.logger.warn( + f'Checkpoint {filename} saving time: {meta.get("time")}') + return meta + def _save_checkpoint(self, trainer): if self.by_epoch: cur_save_name = os.path.join( @@ -74,7 +142,22 @@ class CheckpointHook(Hook): cur_save_name = os.path.join( self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth') - save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) + meta = { + 'epoch': trainer.epoch, + 'iter': trainer.iter + 1, + 'inner_iter': trainer.inner_iter + 1, + 'rng_state': self.rng_state, + } + for i, hook in enumerate(trainer.hooks): + if hasattr(hook, 'state_dict'): + meta[f'{hook.__class__}-{i}'] = hook.state_dict() + + save_checkpoint( + trainer.model, + cur_save_name, + trainer.optimizer, + trainer.lr_scheduler, + meta=meta) if (self.is_last_epoch(trainer) and self.by_epoch) or (self.is_last_iter(trainer) and not self.by_epoch): @@ -144,6 +227,7 @@ class BestCkptSaverHook(CheckpointHook): by_epoch=True, save_optimizer=True, save_dir=None, + save_file_name=None, interval=0): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' 
super().__init__( @@ -156,6 +240,7 @@ class BestCkptSaverHook(CheckpointHook): self.rule = rule self._best_metric = None self._best_ckpt_file = None + self.save_file_name = save_file_name def _should_save(self, trainer): return self._is_best_metric(trainer.metric_values) @@ -179,16 +264,44 @@ class BestCkptSaverHook(CheckpointHook): return False def _save_checkpoint(self, trainer): - if self.by_epoch: - cur_save_name = os.path.join( - self.save_dir, - f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth' - ) - else: - cur_save_name = os.path.join( - self.save_dir, - f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth' - ) - save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) + cur_save_name = self.save_file_name + if cur_save_name is None: + if self.by_epoch: + cur_save_name = os.path.join( + self.save_dir, + f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth' + ) + else: + cur_save_name = os.path.join( + self.save_dir, + f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth' + ) + + meta = { + 'epoch': trainer.epoch, + 'iter': trainer.iter + 1, + 'inner_iter': trainer.inner_iter + 1, + 'rng_state': self.rng_state, + } + for i, hook in enumerate(trainer.hooks): + meta[f'{hook.__class__}-{i}'] = hook.state_dict() + + if os.path.isfile(cur_save_name): + os.remove(cur_save_name) + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer, + trainer.lr_scheduler, meta) self._best_ckpt_file = cur_save_name self._save_pretrained(trainer) + + def state_dict(self): + return { + 'best_metric': self._best_metric, + } + + def load_state_dict(self, state_dict): + if state_dict is not None and len(state_dict) > 0: + self._best_metric = state_dict.get('best_metric') + else: + self.logger.warn( + 'The state_dict is not available, the best metric value will be affected.' 
+ ) diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py index 1c567f1c..d3805be8 100644 --- a/modelscope/trainers/hooks/hook.py +++ b/modelscope/trainers/hooks/hook.py @@ -215,3 +215,9 @@ class Hook: trigger_stages.add(stage) return [stage for stage in Hook.stages if stage in trigger_stages] + + def state_dict(self): + return {} + + def load_state_dict(self, state_dict): + pass diff --git a/modelscope/trainers/hooks/optimizer/base.py b/modelscope/trainers/hooks/optimizer/base.py index dffad6ea..8c61dfdb 100644 --- a/modelscope/trainers/hooks/optimizer/base.py +++ b/modelscope/trainers/hooks/optimizer/base.py @@ -4,6 +4,7 @@ import logging from torch.nn.utils import clip_grad from modelscope.metainfo import Hooks +from modelscope.outputs import OutputKeys from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.hook import Hook from modelscope.trainers.hooks.priority import Priority @@ -27,7 +28,7 @@ class OptimizerHook(Hook): def __init__(self, cumulative_iters=1, grad_clip=None, - loss_keys='loss') -> None: + loss_keys=OutputKeys.LOSS) -> None: if isinstance(loss_keys, str): loss_keys = [loss_keys] assert isinstance(loss_keys, (tuple, list)) diff --git a/modelscope/trainers/lrscheduler/warmup/base.py b/modelscope/trainers/lrscheduler/warmup/base.py index 81497817..4b066281 100644 --- a/modelscope/trainers/lrscheduler/warmup/base.py +++ b/modelscope/trainers/lrscheduler/warmup/base.py @@ -28,10 +28,10 @@ class BaseWarmup(_LRScheduler): return self.base_scheduler.get_lr() def state_dict(self): - self.base_scheduler.state_dict() + return self.base_scheduler.state_dict() def load_state_dict(self, state_dict): - self.base_scheduler.load_state_dict(state_dict) + return self.base_scheduler.load_state_dict(state_dict) def scale(self): """Scale the learning rates. 
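The checkpoint changes above round-trip per-hook state: _save_checkpoint stores each hook's state_dict() in the checkpoint meta under a key of the form '{hook.__class__}-{i}', and load_checkpoint feeds that entry back through load_state_dict() when checkpoint_file is given, which is how BestCkptSaverHook now restores its best metric across resumes. A minimal sketch of a custom hook taking part in that round-trip (the hook itself is illustrative and not part of this patch):

    from modelscope.trainers.hooks.hook import Hook


    class ExampleCounterHook(Hook):
        """Illustrative hook that counts finished epochs and survives resume."""

        def __init__(self):
            self.finished_epochs = 0

        def after_train_epoch(self, trainer):
            self.finished_epochs += 1

        def state_dict(self):
            # collected into the checkpoint meta by CheckpointHook._save_checkpoint
            return {'finished_epochs': self.finished_epochs}

        def load_state_dict(self, state_dict):
            # restored by CheckpointHook.load_checkpoint when resuming from checkpoint_file
            self.finished_epochs = state_dict.get('finished_epochs', 0)

Because the Hook base class now ships empty default state_dict()/load_state_dict() implementations, hooks that do not override them keep working unchanged; load_checkpoint only warns when a saved entry for a hook is missing.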
diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py index 7ab8fd70..001cfefc 100644 --- a/modelscope/trainers/nlp/__init__.py +++ b/modelscope/trainers/nlp/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_trainer import SequenceClassificationTrainer from .csanmt_translation_trainer import CsanmtTranslationTrainer + from .passage_ranking_trainer import PassageRankingTranier else: _import_structure = { 'sequence_classification_trainer': ['SequenceClassificationTrainer'], 'csanmt_translation_trainer': ['CsanmtTranslationTrainer'], + 'passage_ranking_trainer': ['PassageRankingTrainer'] } import sys diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/passage_ranking_trainer.py new file mode 100644 index 00000000..e54c2904 --- /dev/null +++ b/modelscope/trainers/nlp/passage_ranking_trainer.py @@ -0,0 +1,197 @@ +import time +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.utils.data import DataLoader, Dataset + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model, TorchModel +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer +from modelscope.utils.constant import DEFAULT_MODEL_REVISION +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@dataclass +class GroupCollator(): + """ + Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg] + and pass batch separately to the actual collator. + Abstract out data detail for the model. 
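+    Tensors collected under the same key are concatenated along dim 0.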
+ """ + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(features[0], list): + features = sum(features, []) + keys = features[0].keys() + batch = {k: list() for k in keys} + for ele in features: + for k, v in ele.items(): + batch[k].append(v) + batch = {k: torch.cat(v, dim=0) for k, v in batch.items()} + return batch + + +@TRAINERS.register_module(module_name=Trainers.nlp_passage_ranking_trainer) +class PassageRankingTrainer(NlpEpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Preprocessor] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + + if data_collator is None: + data_collator = GroupCollator() + + super().__init__( + model=model, + cfg_file=cfg_file, + cfg_modify_fn=cfg_modify_fn, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + preprocessor=preprocessor, + optimizers=optimizers, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + model_revision=model_revision, + **kwargs) + + def compute_mrr(self, result, k=10): + mrr = 0 + for res in result.values(): + sorted_res = sorted(res, key=lambda x: x[0], reverse=True) + ar = 0 + for index, ele in enumerate(sorted_res[:k]): + if str(ele[1]) == '1': + ar = 1.0 / (index + 1) + break + mrr += ar + return mrr / len(result) + + def compute_ndcg(self, result, k=10): + ndcg = 0 + from sklearn import ndcg_score + for res in result.values(): + sorted_res = sorted(res, key=lambda x: [0], reverse=True) + labels = np.array([[ele[1] for ele in sorted_res]]) + scores = np.array([[ele[0] for ele in sorted_res]]) + ndcg += float(ndcg_score(labels, scores, k=k)) + ndcg = ndcg / len(result) + return ndcg + + def evaluate(self, + checkpoint_path: Optional[str] = None, + *args, + **kwargs) -> Dict[str, float]: + """evaluate a dataset + + evaluate a dataset via a specific model from the `checkpoint_path` path, if the `checkpoint_path` + does not exist, read from the config file. + + Args: + checkpoint_path (Optional[str], optional): the model path. Defaults to None. 
+ + Returns: + Dict[str, float]: the results about the evaluation + Example: + {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} + """ + from modelscope.models.nlp import PassageRanking + # get the raw online dataset + self.eval_dataloader = self._build_dataloader_with_dataset( + self.eval_dataset, + **self.cfg.evaluation.get('dataloader', {}), + collate_fn=self.eval_data_collator) + # generate a standard dataloader + # generate a model + if checkpoint_path is not None: + model = PassageRanking.from_pretrained(checkpoint_path) + else: + model = self.model + + # copy from easynlp (start) + model.eval() + total_samples = 0 + + logits_list = list() + label_list = list() + qid_list = list() + + total_spent_time = 0.0 + device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + model.to(device) + for _step, batch in enumerate(self.eval_dataloader): + try: + batch = { + key: + val.to(device) if isinstance(val, torch.Tensor) else val + for key, val in batch.items() + } + except RuntimeError: + batch = {key: val for key, val in batch.items()} + + infer_start_time = time.time() + with torch.no_grad(): + label_ids = batch.pop('labels').detach().cpu().numpy() + qids = batch.pop('qid').detach().cpu().numpy() + outputs = model(batch) + infer_end_time = time.time() + total_spent_time += infer_end_time - infer_start_time + total_samples += self.eval_dataloader.batch_size + + assert 'scores' in outputs + logits = outputs['scores'] + + label_list.extend(label_ids) + logits_list.extend(logits) + qid_list.extend(qids) + + logger.info('Inference time = {:.2f}s, [{:.4f} ms / sample] '.format( + total_spent_time, total_spent_time * 1000 / total_samples)) + + rank_result = {} + for qid, score, label in zip(qid_list, logits_list, label_list): + if qid not in rank_result: + rank_result[qid] = [] + rank_result[qid].append((score, label)) + + for qid in rank_result: + rank_result[qid] = sorted(rank_result[qid], key=lambda x: x[0]) + + eval_outputs = list() + for metric in self.metrics: + if metric.startswith('mrr'): + k = metric.split('@')[-1] + k = int(k) + mrr = self.compute_mrr(rank_result, k=k) + logger.info('{}: {}'.format(metric, mrr)) + eval_outputs.append((metric, mrr)) + elif metric.startswith('ndcg'): + k = metric.split('@')[-1] + k = int(k) + ndcg = self.compute_ndcg(rank_result, k=k) + logger.info('{}: {}'.format(metric, ndcg)) + eval_outputs.append(('ndcg', ndcg)) + else: + raise NotImplementedError('Metric %s not implemented' % metric) + + return dict(eval_outputs) diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py new file mode 100644 index 00000000..c559ee5b --- /dev/null +++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py @@ -0,0 +1,134 @@ +import os +import time +from typing import Callable, Dict, Optional, Tuple, Union + +import numpy as np + +from modelscope.metainfo import Trainers +from modelscope.models.nlp.space.model.generator import SpaceGenerator +from modelscope.models.nlp.space.model.model_base import SpaceModelBase +from modelscope.preprocessors.space.data_loader import \ + get_sequential_data_loader +from modelscope.preprocessors.space.fields.intent_field import \ + IntentBPETextField +from modelscope.preprocessors.space.preprocess import intent_preprocess +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.nlp.space.trainer.intent_trainer import IntentTrainer +from modelscope.utils.config import Config +from 
modelscope.utils.logger import get_logger + +PATH = None +logger = get_logger(PATH) + + +@TRAINERS.register_module(module_name=Trainers.dialog_intent_trainer) +class DialogIntentTrainer(BaseTrainer): + + def __init__(self, + cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, + *args, + **kwargs): + super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name'])) + + def to_tensor(array): + """ + numpy array -> tensor + """ + import torch + array = torch.tensor(array) + return array.cuda() if self.cfg.use_gpu else array + + def setup_seed(seed): + import random + import torch + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + + self.cfg_modify_fn = cfg_modify_fn + self.cfg = self.rebuild_config(self.cfg) + + setup_seed(self.cfg.Trainer.seed) + + # preprocess data + intent_preprocess(self.cfg.Model.init_checkpoint, self.cfg) + # set reader and evaluator + bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) + + self.cfg.Model.num_token_embeddings = bpe.vocab_size + self.cfg.Model.num_turn_embeddings = bpe.max_ctx_turn + 1 + dataset_paths = [ + os.path.join(self.cfg.Dataset.data_dir, + self.cfg.Dataset.trigger_data) + ] + # set data and data status + collate_fn = bpe.collate_fn_multi_turn + self.train_label_loader = get_sequential_data_loader( + batch_size=self.cfg.Trainer.batch_size_label, + reader=bpe, + hparams=self.cfg, + data_paths=dataset_paths, + collate_fn=collate_fn, + data_type='train') + self.valid_label_loader = get_sequential_data_loader( + batch_size=self.cfg.Trainer.batch_size_label, + reader=bpe, + hparams=self.cfg, + data_paths=dataset_paths, + collate_fn=collate_fn, + data_type='valid') + self.test_label_loader = get_sequential_data_loader( + batch_size=self.cfg.Trainer.batch_size_label, + reader=bpe, + hparams=self.cfg, + data_paths=dataset_paths, + collate_fn=collate_fn, + data_type='test') + + # set generator + generator = SpaceGenerator.create(self.cfg, reader=bpe) + # construct model + self.model = SpaceModelBase.create( + self.cfg.Model.init_checkpoint, + self.cfg, + reader=bpe, + generator=generator) + + import torch + + # multi-gpu + if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1: + self.model = torch.nn.DataParallel(self.model) + + # construct trainer + self.trainer = IntentTrainer( + self.model, to_tensor, self.cfg, reader=bpe) + num_batches = len(self.train_label_loader) + self.trainer.set_optimizers(num_training_steps_per_epoch=num_batches) + # load model, optimizer and lr_scheduler + self.trainer.load() + + def rebuild_config(self, cfg: Config): + if self.cfg_modify_fn is not None: + return self.cfg_modify_fn(cfg) + return cfg + + def train(self, *args, **kwargs): + logger.info('Train') + + self.trainer.train( + train_label_iter=self.train_label_loader, + valid_label_iter=self.valid_label_loader) + + def evaluate(self, + checkpoint_path: Optional[str] = None, + *args, + **kwargs) -> Dict[str, float]: + logger.info('Evaluate') + self.trainer.infer( + data_iter=self.test_label_loader, + ex_data_iter=self.train_label_loader) diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py new file mode 100644 index 00000000..6bdd8a3a --- /dev/null +++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py @@ -0,0 +1,130 @@ +import os +import time +from typing import Callable, Dict, Optional, Tuple, Union + +import numpy as np + +from 
modelscope.metainfo import Trainers +from modelscope.models.nlp.space.model.generator import SpaceGenerator +from modelscope.models.nlp.space.model.model_base import SpaceModelBase +from modelscope.preprocessors.space.fields.gen_field import \ + MultiWOZBPETextField +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.nlp.space.eval import MultiWOZEvaluator +from modelscope.trainers.nlp.space.trainer.gen_trainer import MultiWOZTrainer +from modelscope.utils.config import Config, ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def setup_seed(seed: int): + import random + import torch + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + + +@TRAINERS.register_module(module_name=Trainers.dialog_modeling_trainer) +class DialogModelingTrainer(BaseTrainer): + + def __init__(self, + cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, + *args, + **kwargs): + + super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name'])) + + self.cfg_modify_fn = cfg_modify_fn + self.cfg = self.rebuild_config(self.cfg) + + setup_seed(self.cfg.Trainer.seed) + + # set reader and evaluator + self.bpe = MultiWOZBPETextField(self.cfg, **kwargs) + + self.cfg.Model.num_token_embeddings = self.bpe.vocab_size + self.cfg.Model.num_turn_embeddings = self.bpe.max_ctx_turn + 1 + + if 'work_dir' in kwargs: + self.cfg.Trainer.save_dir = kwargs['work_dir'] + else: + self.cfg.Trainer.save_dir = './default_save_dir' + + # set data and data status + self.train_data = self.bpe.get_batches('train') + self.dev_data = self.bpe.get_batches('dev') + + self.evaluator = MultiWOZEvaluator(reader=self.bpe, **kwargs) + # set generator + self.generator = SpaceGenerator.create(self.cfg, reader=self.bpe) + self._load_model(**kwargs) + + def _load_model(self, **kwargs): + + def to_tensor(array): + """ + numpy array -> tensor + """ + import torch + array = torch.tensor(array) + return array.cuda( + ) if self.cfg.use_gpu and torch.cuda.is_available() else array + + # construct model + if 'model' in kwargs: + self.model = kwargs['model'] + else: + self.model = SpaceModelBase.create( + kwargs['model_dir'], + self.cfg, + reader=self.bpe, + generator=self.generator) + + import torch + # multi-gpu + if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1: + self.model = torch.nn.DataParallel(self.model) + + # construct trainer + self.trainer = MultiWOZTrainer( + self.model, + to_tensor, + self.cfg, + reader=self.bpe, + evaluator=self.evaluator) + self.trainer.set_optimizers() + # load model, optimizer and lr_scheduler + self.trainer.load() + + def rebuild_config(self, cfg: Config): + if self.cfg_modify_fn is not None: + return self.cfg_modify_fn(cfg) + return cfg + + def train(self, *args, **kwargs): + logger.info('Train') + + self.trainer.train(train_data=self.train_data, dev_data=self.dev_data) + + def evaluate(self, + checkpoint_path: Optional[str] = None, + *args, + **kwargs) -> Dict[str, float]: + logger.info('Evaluate') + self.cfg.do_infer = True + + # get best checkpoint path + pos = checkpoint_path.rfind('/') + checkpoint_name = checkpoint_path[pos + 1:] + checkpoint_dir = checkpoint_path[:pos] + + assert checkpoint_name == ModelFile.TORCH_MODEL_BIN_FILE + kwargs['model_dir'] = checkpoint_dir + self._load_model(**kwargs) + self.trainer.infer(data_type='test') diff --git 
a/modelscope/trainers/nlp/space/eval.py b/modelscope/trainers/nlp/space/eval.py new file mode 100644 index 00000000..f315ff07 --- /dev/null +++ b/modelscope/trainers/nlp/space/eval.py @@ -0,0 +1,952 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright from https://github.com/thu-spmi/LABES +# Copyright from https://github.com/TonyNemo/UBAR-MultiWOZ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from collections import Counter + +import json +import numpy as np +from nltk.util import ngrams +from sklearn.metrics import f1_score + +from modelscope.utils.nlp.space import ontology, utils +from modelscope.utils.nlp.space.clean_dataset import clean_slot_values + + +def similar(a, b): + return a == b or a in b or b in a or a.split()[0] == b.split( + )[0] or a.split()[-1] == b.split()[-1] + + +def setsub(a, b): + junks_a = [] + useless_constraint = [ + 'temperature', 'week', 'est ', 'quick', 'reminder', 'near' + ] + for i in a: + flg = False + for j in b: + if similar(i, j): + flg = True + if not flg: + junks_a.append(i) + for junk in junks_a: + flg = False + for item in useless_constraint: + if item in junk: + flg = True + if not flg: + return False + return True + + +def setsim(a, b): + a, b = set(a), set(b) + return setsub(a, b) and setsub(b, a) + + +def DA_evaluate(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + results = {} + + for avg_name in ['micro']: + my_f1_score = f1_score(y_true=labels, y_pred=preds, average=avg_name) + results['f1_{}'.format(avg_name)] = my_f1_score + + return results + + +class BLEUScorer(object): + # BLEU score calculator via GentScorer interface + # it calculates the BLEU-4 by taking the entire corpus in + # Calulate based multiple candidates against multiple references + def __init__(self): + pass + + def score(self, parallel_corpus): + + # containers + count = [0, 0, 0, 0] + clip_count = [0, 0, 0, 0] + r = 0 + c = 0 + weights = [0.25, 0.25, 0.25, 0.25] + + # accumulate ngram statistics + for hyps, refs in parallel_corpus: + hyps = [hyp.split() for hyp in hyps] + refs = [ref.split() for ref in refs] + for hyp in hyps: + + for i in range(4): + # accumulate ngram counts + hypcnts = Counter(ngrams(hyp, i + 1)) + cnt = sum(hypcnts.values()) + count[i] += cnt + + # compute clipped counts + max_counts = {} + for ref in refs: + refcnts = Counter(ngrams(ref, i + 1)) + for ng in hypcnts: + max_counts[ng] = max( + max_counts.get(ng, 0), refcnts[ng]) + clipcnt = \ + dict((ng, min(count, max_counts[ng])) for ng, count in hypcnts.items()) + clip_count[i] += sum(clipcnt.values()) + + # accumulate r & c + bestmatch = [1000, 1000] + for ref in refs: + if bestmatch[0] == 0: + break + diff = abs(len(ref) - len(hyp)) + if diff < bestmatch[0]: + bestmatch[0] = diff + bestmatch[1] = len(ref) + r += bestmatch[1] + c += len(hyp) + + # computing bleu score + p0 = 1e-7 + bp = \ + 1 if c > r else math.exp(1 - float(r) / float(c)) + p_ns = \ + [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)] + s = \ + math.fsum(w * 
math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n) + bleu = bp * math.exp(s) + return bleu * 100 + + +"""" +For the data preparation and evaluation on MultiWOZ2.0/2.1, +we refer to the code of UBAR (https://github.com/TonyNemo/UBAR-MultiWOZ) +""" + + +class MultiWOZEvaluator(object): + + def __init__(self, reader, **kwargs): + self.reader = reader + self.domains = ontology.all_domains + self.all_data = self.reader.data + self.test_data = self.reader.test + + self.bleu_scorer = BLEUScorer() + + self.all_info_slot = [] + for d, s_list in ontology.informable_slots.items(): + for s in s_list: + self.all_info_slot.append(d + '-' + s) + + # only evaluate these slots for dialog success + self.requestables = ['phone', 'address', 'postcode', 'reference', 'id'] + self.db_dir = kwargs['data_dir'] + + def pack_dial(self, data): + dials = {} + for turn in data: + dial_id = turn['dial_id'] + if dial_id not in dials: + dials[dial_id] = [] + dials[dial_id].append(turn) + return dials + + def validation_metric(self, data, fout=None): + bleu = self.bleu_metric(data) + # accu_single_dom, accu_multi_dom, multi_dom_num = self.domain_eval(data) + success, match, req_offer_counts, dial_num = \ + self.context_to_response_eval(data, same_eval_as_cambridge=True, fout=fout) + return bleu, success, match + + def bleu_metric(self, data, eval_dial_list=None): + gen, truth = [], [] + for row in data: + if eval_dial_list and row[ + 'dial_id'] + '.json' not in eval_dial_list: + continue + gen.append(row['resp_gen']) + truth.append(row['resp']) + wrap_generated = [[_] for _ in gen] + wrap_truth = [[_] for _ in truth] + if gen and truth: + try: + sc = self.bleu_scorer.score(zip(wrap_generated, wrap_truth)) + except Exception: + sc = 0.0 + else: + sc = 0.0 + return sc + + def context_to_response_eval(self, + data, + eval_dial_list=None, + same_eval_as_cambridge=False, + fout=None): + dials = self.pack_dial(data) + counts = {} + for req in self.requestables: + counts[req + '_total'] = 0 + counts[req + '_offer'] = 0 + + dial_num, successes, matches = 0, 0, 0 + + for dial_id in dials: + if eval_dial_list and dial_id + '.json' not in eval_dial_list: + continue + dial = dials[dial_id] + reqs = {} + goal = {} + if '.json' not in dial_id and '.json' in list( + self.all_data.keys())[0]: + dial_id = dial_id + '.json' + for domain in ontology.all_domains: + if self.all_data[dial_id]['goal'].get(domain): + true_goal = self.all_data[dial_id]['goal'] + goal = self._parseGoal(goal, true_goal, domain) + + for domain in goal.keys(): + reqs[domain] = goal[domain]['requestable'] + + success, match, stats, counts = \ + self._evaluateGeneratedDialogue(dial, goal, reqs, counts, + same_eval_as_cambridge=same_eval_as_cambridge, fout=fout) + + successes += success + matches += match + dial_num += 1 + + succ_rate = successes / (float(dial_num) + 1e-10) * 100 + match_rate = matches / (float(dial_num) + 1e-10) * 100 + return succ_rate, match_rate, counts, dial_num + + def _evaluateGeneratedDialogue(self, + dialog, + goal, + real_requestables, + counts, + soft_acc=False, + same_eval_as_cambridge=False, + fout=None): + """Evaluates the dialogue created by the model. + First we load the user goal of the dialogue, then for each turn + generated by the system we look for key-words. + For the Inform rate we look whether the entity was proposed. 
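+        (e.g. a delexicalized token such as [value_name] in the generated
+        response counts as an offered entity for that domain).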
+ For the Success rate we look for requestables slots""" + # for computing corpus success + requestables = self.requestables + + # CHECK IF MATCH HAPPENED + provided_requestables = {} + venue_offered = {} + domains_in_goal = [] + log = [] + bspans = {} + + for domain in goal.keys(): + venue_offered[domain] = [] + provided_requestables[domain] = [] + domains_in_goal.append(domain) + + for t, turn in enumerate(dialog): + if t == 0: + continue + if fout is not None: + log.append({ + 'turn_num': turn['turn_num'], + 'turn_domain': turn['dspn'], + 'user': turn['user'], + 'aspn': turn['aspn'], + 'aspn_gen': turn['aspn_gen'], + 'resp': turn['resp'], + 'resp_gen': turn['resp_gen'], + 'pointer': turn['pointer'], + }) + + sent_t = turn['resp_gen'] + + for domain in goal.keys(): + # for computing success + if same_eval_as_cambridge: + # [restaurant_name], [hotel_name] instead of [value_name] + if self.reader.use_true_domain_for_ctr_eval: + dom_pred = [d[1:-1] for d in turn['dspn'].split()] + else: + dom_pred = [d[1:-1] for d in turn['dspn_gen'].split()] + + if domain not in dom_pred: # fail + continue + if '[value_name]' in sent_t or '[value_id]' in sent_t: + if domain in [ + 'restaurant', 'hotel', 'attraction', 'train' + ]: + # HERE YOU CAN PUT YOUR BELIEF STATE ESTIMATION + if not self.reader.use_true_curr_bspn and not self.reader.use_true_bspn_for_ctr_eval: + bspn = turn['bspn_gen'] + else: + bspn = turn['bspn'] + + constraint_dict = self.reader.bspan_to_constraint_dict( + bspn) + if constraint_dict.get(domain): + venues = self.reader.db.queryJsons( + domain, + constraint_dict[domain], + return_name=True) + else: + venues = [] + + if len(venue_offered[domain]) == 0 and venues: + + venue_offered[domain] = venues + bspans[domain] = constraint_dict[domain] + else: + flag = False + for ven in venues: + if ven not in venue_offered[domain]: + flag = True + break + if flag and venues: # sometimes there are no results so sample won't work + venue_offered[domain] = venues + bspans[domain] = constraint_dict[domain] + else: # not limited so we can provide one + venue_offered[domain] = '[value_name]' + + # ATTENTION: assumption here - we didn't provide phone or address twice! etc + for requestable in requestables: + if requestable == 'reference': + if '[value_reference]' in sent_t: + if domain in ['restaurant', 'hotel', 'train']: + if 'booked' in turn['pointer'] or 'ok' in turn[ + 'pointer'] or '[value_reference]' in turn[ + 'resp']: + # if pointer was allowing for that? + provided_requestables[domain].append( + 'reference') + else: + provided_requestables[domain].append( + 'reference') + else: + if '[value_' + requestable + ']' in sent_t: + provided_requestables[domain].append(requestable) + + # if name was given in the task + for domain in goal.keys(): + # if name was provided for the user, the match is being done automatically + if 'name' in goal[domain]['informable']: + venue_offered[domain] = '[value_name]' + + # special domains - entity does not need to be provided + if domain in ['taxi', 'police', 'hospital']: + venue_offered[domain] = '[value_name]' + + if domain == 'train': + if not venue_offered[domain] and 'id' not in goal[domain][ + 'requestable']: + venue_offered[domain] = '[value_name]' + """ + Given all inform and requestable slots + we go through each domain from the user goal + and check whether right entity was provided and + all requestable slots were given to the user. + The dialogue is successful if that's the case for all domains. 
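+
+        Illustrative (hypothetical) example: with goal domains 'restaurant'
+        and 'train', match is 1 only if a correct venue was offered for both,
+        and success additionally requires every requested slot (e.g. 'phone',
+        'reference') to appear in the generated responses.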
+ """ + # HARD EVAL + stats = { + 'restaurant': [0, 0, 0], + 'hotel': [0, 0, 0], + 'attraction': [0, 0, 0], + 'train': [0, 0, 0], + 'taxi': [0, 0, 0], + 'hospital': [0, 0, 0], + 'police': [0, 0, 0] + } + + match = 0 + success = 0 + # MATCH + for domain in goal.keys(): + match_stat = 0 + if domain in ['restaurant', 'hotel', 'attraction', 'train']: + goal_venues = self.reader.db.queryJsons( + domain, goal[domain]['informable'], return_name=True) + if type(venue_offered[domain] + ) is str and '_name' in venue_offered[domain]: + match += 1 + match_stat = 1 + elif len(venue_offered[domain]) > 0 and len( + set(venue_offered[domain]) & set(goal_venues)) > 0: + match += 1 + match_stat = 1 + else: + if '_name]' in venue_offered[domain]: + match += 1 + match_stat = 1 + + stats[domain][0] = match_stat + stats[domain][2] = 1 + + if soft_acc: + match = float(match) / len(goal.keys()) + else: + if match == len(goal.keys()): + match = 1.0 + else: + match = 0.0 + + for domain in domains_in_goal: + for request in real_requestables[domain]: + counts[request + '_total'] += 1 + if request in provided_requestables[domain]: + counts[request + '_offer'] += 1 + + # SUCCESS + if fout is not None: + for domain in domains_in_goal: + success_stat = 0 + domain_success = 0 + if len(real_requestables[domain]) == 0: + success += 1 + success_stat = 1 + stats[domain][1] = success_stat + continue + # if values in sentences are super set of requestables + for request in real_requestables[domain]: + if request in provided_requestables[domain]: + domain_success += 1 + + if domain_success == len(real_requestables[domain]): + success += 1 + success_stat = 1 + + stats[domain][1] = success_stat + + # final eval + if soft_acc: + success = float(success) / len(real_requestables) + else: + if success >= len(real_requestables): + success = 1 + else: + success = 0 + else: + if match == 1.0: + for domain in domains_in_goal: + success_stat = 0 + domain_success = 0 + if len(real_requestables[domain]) == 0: + success += 1 + success_stat = 1 + stats[domain][1] = success_stat + continue + # if values in sentences are super set of requestables + for request in real_requestables[domain]: + if request in provided_requestables[domain]: + domain_success += 1 + + if domain_success == len(real_requestables[domain]): + success += 1 + success_stat = 1 + + stats[domain][1] = success_stat + + # final eval + if soft_acc: + success = float(success) / len(real_requestables) + else: + if success >= len(real_requestables): + success = 1 + else: + success = 0 + + if fout is not None and success == 0: + sample = { + dialog[0]['dial_id']: { + 'log': log, + 'real_requestables': real_requestables, + 'provided_requestables': provided_requestables + } + } + line = json.dumps(sample) + fout.write(line) + fout.write('\n') + + return success, match, stats, counts + + def _parseGoal(self, goal, true_goal, domain): + """Parses user goal into dictionary format.""" + goal[domain] = {} + goal[domain] = {'informable': {}, 'requestable': [], 'booking': []} + if 'info' in true_goal[domain]: + if domain == 'train': + # we consider dialogues only where train had to be booked! 
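+                # illustrative, hypothetical goal shape only:
+                #   true_goal['train'] = {'info': {...}, 'book': {...}, 'reqt': ['id']}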
+ if 'book' in true_goal[domain]: + goal[domain]['requestable'].append('reference') + if 'reqt' in true_goal[domain]: + if 'id' in true_goal[domain]['reqt']: + goal[domain]['requestable'].append('id') + else: + if 'reqt' in true_goal[domain]: + for s in true_goal[domain]['reqt']: # addtional requests: + if s in [ + 'phone', 'address', 'postcode', 'reference', + 'id' + ]: + # ones that can be easily delexicalized + goal[domain]['requestable'].append(s) + if 'book' in true_goal[domain]: + goal[domain]['requestable'].append('reference') + + for s, v in true_goal[domain]['info'].items(): + s_, v_ = clean_slot_values(self.db_dir, domain, s, v) + if len(v_.split()) > 1: + v_ = ' '.join( + [token.text for token in self.reader.nlp(v_)]).strip() + goal[domain]['informable'][s_] = v_ + + if 'book' in true_goal[domain]: + goal[domain]['booking'] = true_goal[domain]['book'] + return goal + + +class GenericEvaluator: + + def __init__(self, reader): + self.reader = reader + self.metric_dict = {} + + def pack_dial(self, data): + dials = {} + for turn in data: + dial_id = turn['dial_id'] + if dial_id not in dials: + dials[dial_id] = [] + dials[dial_id].append(turn) + return dials + + def run_metrics(self, results): + raise ValueError('Please specify the evaluator first') + + def bleu_metric(self, data, type='bleu'): + gen, truth = [], [] + for row in data: + gen.append(self.clean(row['resp_gen'])) + # gen.append(self.clean(row['resp'])) + truth.append(self.clean(row['resp'])) + wrap_generated = [[_] for _ in gen] + wrap_truth = [[_] for _ in truth] + sc = BLEUScorer().score(zip(wrap_generated, wrap_truth)) + return sc + + def _normalize_constraint(self, + constraint, + ignore_dontcare=False, + intersection=True): + """ + Normalize belief span, e.g. delete repeated words + :param constraint - {'food': 'asian oritental', 'pricerange': 'cheap'} + :param intersection: if true, only keeps the words that appear in th ontology + we set intersection=True as in previous works + :returns: normalized constraint dict + e.g. 
- {'food': 'asian oritental', 'pricerange': 'cheap', 'area': ''} + """ + normalized = {} + for s in self.informable_slots: + normalized[s] = '' + for s, v in constraint.items(): + if ignore_dontcare and v == 'dontcare': + continue + if intersection and v != 'dontcare' and v not in self.entities_flat: + continue + + normalized[s] = v + + return normalized + + def _normalize_act(self, aspn, intersection=False): + aspn_list = aspn.split('|') + normalized = {} + for i, v in enumerate(aspn_list): + seq = v.strip() + word_set = set() + for w in seq.split(): + if intersection: + if self.reader.act_order[i] == 'av': + if '[value' in w: + word_set.add(w) + else: + if w in self.requestable_slots: + word_set.add(w) + else: + word_set.add(w) + normalized[self.reader.act_order[i]] = word_set + return normalized + + def tracker_metric(self, data, normalize=True): + # turn level metric + tp, fp, fn, db_correct = 0, 0, 0, 0 + goal_accr, slot_accr, total = 0, {}, 1e-8 + for s in self.informable_slots: + slot_accr[s] = 0 + + for row in data: + if normalize: + gen = self._normalize_constraint(row['bspn_gen']) + truth = self._normalize_constraint(row['bspn']) + else: + gen = self._normalize_constraint( + row['bspn_gen'], intersection=False) + truth = self._normalize_constraint( + row['bspn'], intersection=False) + valid = 'thank' not in row['user'] and 'bye' not in row['user'] + if valid: + for slot, value in gen.items(): + if value in truth[slot]: + tp += 1 + else: + fp += 1 + for slot, value in truth.items(): + if value not in gen[slot]: + fn += 1 + + if truth and valid: + total += 1 + for s in self.informable_slots: + if gen[s] == truth[s]: + slot_accr[s] += 1 + if gen == truth: + goal_accr += 1 + if row.get('db_gen') and row.get('db_match'): + if row['db_gen'] == row['db_match']: + db_correct += 1 + precision, recall = tp / (tp + fp + 1e-8), tp / (tp + fn + 1e-8) + f1 = 2 * precision * recall / (precision + recall + 1e-8) + goal_accr /= total + db_correct /= total + for s in slot_accr: + slot_accr[s] /= total + return precision, recall, f1, goal_accr, slot_accr, db_correct + + def request_metric(self, data): + # dialog level metric + dials = self.pack_dial(data) + tp, fp, fn = 0, 0, 0 + for dial_id in dials: + truth_req, gen_req = set(), set() + dial = dials[dial_id] + for turn_num, turn in enumerate(dial): + resp_gen_token = self.clean(turn['resp_gen']).split() + resp_token = self.clean(turn['resp']).split() + for w in resp_gen_token: + if '[value_' in w and w.endswith( + ']') and w != '[value_name]': + gen_req.add(w[1:-1].split('_')[1]) + for w in resp_token: + if '[value_' in w and w.endswith( + ']') and w != '[value_name]': + truth_req.add(w[1:-1].split('_')[1]) + for req in gen_req: + if req in truth_req: + tp += 1 + else: + fp += 1 + for req in truth_req: + if req not in gen_req: + fn += 1 + precision, recall = tp / (tp + fp + 1e-8), tp / (tp + fn + 1e-8) + f1 = 2 * precision * recall / (precision + recall + 1e-8) + return f1, precision, recall + + def act_metric(self, data): + # turn level metric + tp, fp, fn = { + 'all_s': 0, + 'all_v': 0 + }, { + 'all_s': 0, + 'all_v': 0 + }, { + 'all_s': 0, + 'all_v': 0 + } + for s in self.requestable_slots: + tp[s], fp[s], fn[s] = 0, 0, 0 + tp['[value_%s]' % s], fp['[value_%s]' % s], fn['[value_%s]' + % s] = 0, 0, 0 + + for row in data: + gen = self._normalize_act(row['aspn_gen']) + truth = self._normalize_act(row['aspn']) + valid = 'thank' not in row['user'] and 'bye' not in row['user'] + if valid: + # how well the act decoder captures user's requests + for 
value in gen['av']: + if value in truth['av']: + tp['all_v'] += 1 + if tp.get(value): + tp[value] += 1 + else: + fp['all_v'] += 1 + if fp.get(value): + fp[value] += 1 + for value in truth['av']: + if value not in gen['av']: + fn['all_v'] += 1 + if fn.get(value): + fn[value] += 1 + + # how accurately the act decoder predicts system's question + if 'as' not in gen: + continue + for slot in gen['as']: + if slot in truth['as']: + tp['all_s'] += 1 + if tp.get(slot): + tp[slot] += 1 + else: + fp['all_s'] += 1 + if fp.get(slot): + fp[slot] += 1 + for slot in truth['as']: + if slot not in gen['as']: + fn['all_s'] += 1 + if fn.get(slot): + fn[slot] += 1 + + result = {} + for k, v in tp.items(): + precision, recall = tp[k] / (tp[k] + fp[k] + 1e-8), tp[k] / ( + tp[k] + fn[k] + 1e-8) + f1 = 2 * precision * recall / (precision + recall + 1e-8) + result[k] = [f1, precision, recall] + return result + + +""" +For the data preparation and evaluation on In-Car Assistant/CamRest, +we refer to the code of LABES (https://github.com/thu-spmi/LABES) +""" + + +class CamRestEvaluator(GenericEvaluator): + + def __init__(self, reader): + super().__init__(reader) + self.entities_flat, self.entitiy_to_slot_dict = self.get_entities( + self.reader.ontology_path) + self.informable_slots = self.reader.otlg.informable_slots + self.requestable_slots = self.reader.otlg.requestable_slots + + def run_metrics(self, results): + metrics = {} + bleu = self.bleu_metric(results) + p, r, f1, goal_acc, slot_acc, db_acc = self.tracker_metric(results) + match = self.match_metric(results) + req_f1, req_p, req_r = self.request_metric(results) + + metrics['bleu'] = bleu + metrics['match'] = match + metrics['req_f1'] = req_f1 + metrics['joint_goal'] = goal_acc + metrics['slot_accu'] = slot_acc + metrics['slot-p/r/f1'] = (p, r, f1) + metrics['db_acc'] = db_acc + + return metrics + + def get_entities(self, entity_path): + entities_flat = [] + entitiy_to_slot_dict = {} + raw_entities = json.loads(open(entity_path).read().lower()) + for s in raw_entities['informable']: + entities_flat.extend(raw_entities['informable'][s]) + for v in raw_entities['informable'][s]: + entitiy_to_slot_dict[v] = s + return entities_flat, entitiy_to_slot_dict + + def constraint_same(self, truth_cons, gen_cons): + if not truth_cons and not gen_cons: + return True + if not truth_cons or not gen_cons: + return False + return setsim(gen_cons, truth_cons) + + def match_metric(self, data): + dials = self.pack_dial(data) + match, total = 0, 1e-8 + for dial_id in dials: + dial = dials[dial_id] + truth_cons, gen_cons = {'1': '', '2': '', '3': ''}, None + for turn_num, turn in enumerate(dial): + # find the last turn which the system provide an entity + if '[value' in turn['resp_gen']: + gen_cons = self._normalize_constraint( + turn['bspn_gen'], ignore_dontcare=True) + if '[value' in turn['resp']: + truth_cons = self._normalize_constraint( + turn['bspn'], ignore_dontcare=True) + if not gen_cons: + # if no entity is provided, choose the state of the last dialog turn + gen_cons = self._normalize_constraint( + dial[-1]['bspn_gen'], ignore_dontcare=True) + if list(truth_cons.values()) != ['', '', '']: + if gen_cons == truth_cons: + match += 1 + total += 1 + + return match / total + + def clean(self, resp): + # we use the same clean process as in Sequicity, SEDST, FSDM + # to ensure comparable results + resp = resp.replace(f'{self.reader.sos_r_token} ', '') + resp = resp.replace(f' {self.reader.eos_r_token}', '') + resp = f'{self.reader.sos_r_token} {resp} {self.reader.eos_r_token}' 
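+        # Illustrative only (made-up strings): a response like 'it serves
+        # chinese food' becomes 'it serves [value_food] food' after the
+        # entity-to-slot replacements below.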
+ for value, slot in self.entitiy_to_slot_dict.items(): + + resp = utils.clean_replace(resp, value, '[value_%s]' % slot) + return resp + + +class KvretEvaluator(GenericEvaluator): + + def __init__(self, reader): + super().__init__(reader) + self.entities_flat, self.entitiy_to_slot_dict = self.get_entities( + self.reader.ontology_path) + self.informable_slots = self.reader.otlg.informable_slots + self.requestable_slots = self.reader.otlg.requestable_slots + + def run_metrics(self, results): + metrics = {} + bleu = self.bleu_metric(results) + p, r, f1, goal_acc, slot_acc, db_acc = self.tracker_metric( + results, normalize=True) + match = self.match_metric(results) + req_f1, req_p, req_r = self.request_metric(results) + + metrics['bleu'] = bleu + metrics['match'] = match + metrics['req_f1'] = req_f1 + metrics['joint_goal'] = goal_acc + metrics['slot_accu'] = slot_acc + metrics['slot-p/r/f1'] = (p, r, f1) + metrics['db_acc'] = db_acc + + return metrics + + def _normalize_constraint(self, + constraint, + ignore_dontcare=False, + intersection=True): + """ + Normalize belief span, e.g. delete repeated words + :param constraint - {'food': 'asian oritental', 'pricerange': 'cheap'} + :param intersection: if true, only keeps the words that appear in th ontology + we set intersection=True as in previous works + :returns: normalized constraint dict + e.g. - {'food': 'asian oritental', 'pricerange': 'cheap', 'area': ''} + """ + junk = [ + 'good', 'great', 'quickest', 'shortest', 'route', 'week', + 'fastest', 'nearest', 'next', 'closest', 'way', 'mile', 'activity', + 'restaurant', 'appointment' + ] + normalized = {} + for s in self.informable_slots: + normalized[s] = '' + for s, v in constraint.items(): + for j in junk: + v = ' '.join(v.replace(j, '').split()) + if intersection and v not in self.entities_flat: + continue + + if s in self.informable_slots: + normalized[s] = v + else: + # TODO only use slot (not domain) in s for matching !!! 
+ pass + + return normalized + + def get_entities(self, entity_path): + entities_flat = [] + entitiy_to_slot_dict = {} + + entitiy_to_slot_dict = self.reader.entity_dict + for s in entitiy_to_slot_dict: + if s not in entities_flat: + entities_flat.append(s) + return entities_flat, entitiy_to_slot_dict + + def constraint_same(self, truth_cons, gen_cons): + if not truth_cons and not gen_cons: + return True + if not truth_cons or not gen_cons: + return False + return setsim(gen_cons, truth_cons) + + def match_metric(self, data): + dials = self.pack_dial(data) + match, total = 0, 1e-8 + for dial_id in dials: + dial = dials[dial_id] + truth_cons, gen_cons = { + '1': '', + '2': '', + '3': '', + '4': '', + '5': '', + '6': '', + '7': '', + '8': '', + '9': '', + '10': '', + '11': '' + }, None + for turn_num, turn in enumerate(dial): + # find the last turn which the system provide an entity + if '[value' in turn['resp_gen']: + gen_cons = self._normalize_constraint( + turn['bspn_gen'], ignore_dontcare=True) + if '[value' in turn['resp']: + truth_cons = self._normalize_constraint( + turn['bspn'], ignore_dontcare=True) + + if not gen_cons: + # if no entity is provided, choose the state of the last dialog turn + gen_cons = self._normalize_constraint( + dial[-1]['bspn_gen'], ignore_dontcare=True) + + if list(truth_cons.values()) != [''] * 11: + gen_cons = [x for x in gen_cons.values() if x] + truth_cons = [x for x in truth_cons.values() if x] + if self.constraint_same(gen_cons, truth_cons): + match += 1 + total += 1 + + return match / total + + def clean(self, resp): + # we use the same clean process as in Sequicity, SEDST, FSDM + # to ensure comparable results + resp = resp.replace(f'{self.reader.sos_r_token} ', '') + resp = resp.replace(f' {self.reader.eos_r_token}', '') + resp = f'{self.reader.sos_r_token} {resp} {self.reader.eos_r_token}' + for value, slot in self.entitiy_to_slot_dict.items(): + resp = utils.clean_replace(resp, value, '[value_%s]' % slot) + return resp diff --git a/modelscope/trainers/nlp/space/trainer/gen_trainer.py b/modelscope/trainers/nlp/space/trainer/gen_trainer.py index aa28d798..34cd2f9b 100644 --- a/modelscope/trainers/nlp/space/trainer/gen_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/gen_trainer.py @@ -15,27 +15,11 @@ from transformers.optimization import AdamW, get_linear_schedule_with_warmup from modelscope.trainers.nlp.space.metrics.metrics_tracker import \ MetricsTracker +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger from modelscope.utils.nlp.space import ontology -def get_logger(log_path, name='default'): - logger = logging.getLogger(name) - logger.propagate = False - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter('%(message)s') - - sh = logging.StreamHandler(sys.stdout) - sh.setFormatter(formatter) - logger.addHandler(sh) - - fh = logging.FileHandler(log_path, mode='w') - fh.setFormatter(formatter) - logger.addHandler(fh) - - return logger - - class Trainer(object): def __init__(self, @@ -51,15 +35,16 @@ class Trainer(object): self.do_train = config.do_train self.do_infer = config.do_infer - self.is_decreased_valid_metric = config.Trainer.valid_metric_name[ - 0] == '-' - self.valid_metric_name = config.Trainer.valid_metric_name[1:] - self.num_epochs = config.Trainer.num_epochs - # self.save_dir = config.Trainer.save_dir - self.log_steps = config.Trainer.log_steps - self.valid_steps = config.Trainer.valid_steps - self.save_checkpoint = config.Trainer.save_checkpoint - self.save_summary = 
config.Trainer.save_summary + if self.do_train: + self.is_decreased_valid_metric = config.Trainer.valid_metric_name[ + 0] == '-' + self.valid_metric_name = config.Trainer.valid_metric_name[1:] + self.num_epochs = config.Trainer.num_epochs + self.save_dir = config.Trainer.save_dir + self.log_steps = config.Trainer.log_steps + self.valid_steps = config.Trainer.valid_steps + self.save_checkpoint = config.Trainer.save_checkpoint + self.save_summary = config.Trainer.save_summary self.lr = config.Model.lr self.weight_decay = config.Model.weight_decay self.batch_size = config.Trainer.batch_size @@ -71,22 +56,21 @@ class Trainer(object): self.optimizer = optimizer self.model = model - self.func_model = self.model.module if self.gpu > 1 else self.model + self.func_model = self.model.module if self.gpu > 1 and config.use_gpu else self.model self.reader = reader self.evaluator = evaluator self.tokenizer = reader.tokenizer - # if not os.path.exists(self.save_dir): - # os.makedirs(self.save_dir) - - # self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer") - self.logger = logger or get_logger('trainer.log', 'trainer') + self.logger = get_logger() self.batch_metrics_tracker = MetricsTracker() self.token_metrics_tracker = MetricsTracker() - self.best_valid_metric = float( - 'inf' if self.is_decreased_valid_metric else '-inf') + if self.do_train: + if not os.path.exists(self.save_dir): + os.makedirs(self.save_dir) + self.best_valid_metric = float( + 'inf' if self.is_decreased_valid_metric else '-inf') self.epoch = 0 def decode_generated_bspn_resp(self, generated): @@ -248,9 +232,12 @@ class Trainer(object): # Save current best model if is_best: - best_model_file = os.path.join(self.save_dir, 'best.model') + best_model_file = os.path.join(self.save_dir, + ModelFile.TORCH_MODEL_BIN_FILE) torch.save(self.model.state_dict(), best_model_file) - best_train_file = os.path.join(self.save_dir, 'best.train') + best_train_file = os.path.join( + self.save_dir, + '{}.train'.format(ModelFile.TORCH_MODEL_BIN_FILE)) torch.save(train_state, best_train_file) self.logger.info( f"Saved best model state to '{best_model_file}' with new best valid metric " @@ -324,8 +311,7 @@ class Trainer(object): self.func_model.load_state_dict(model_state_dict) self.logger.info( - f"Loaded model state from '{self.func_model.init_checkpoint}.model'" - ) + f"Loaded model state from '{self.func_model.init_checkpoint}'") def _load_train_state(): train_file = f'{self.func_model.init_checkpoint}.train' @@ -558,19 +544,17 @@ class MultiWOZTrainer(Trainer): generated_bs = outputs[0].cpu().numpy().tolist() bspn_gen = self.decode_generated_bspn(generated_bs) # check DB result - if self.reader.use_true_db_pointer: # To control whether current db is ground truth + if self.reader.use_true_db_pointer: db = turn['db'] else: db_result = self.reader.bspan_to_DBpointer( self.tokenizer.decode(bspn_gen), turn['turn_domain']) - assert len(turn['db']) == 4 - book_result = turn['db'][2] + assert len(turn['db']) == 3 assert isinstance(db_result, str) db = \ [self.reader.sos_db_id] + \ self.tokenizer.convert_tokens_to_ids([db_result]) + \ - [book_result] + \ [self.reader.eos_db_id] prompt_id = self.reader.sos_a_id @@ -636,7 +620,7 @@ class MultiWOZTrainer(Trainer): score = 0.5 * (success + match) + bleu # log results - metrics_message = 'match: %2.2f success: %2.2f bleu: %2.2f score: %.2f' %\ + metrics_message = 'match: %2.2f success: %2.2f bleu: %2.2f score: %.2f' % \ (match, success, bleu, score) message_prefix = 
f'[Infer][{self.epoch}]' time_cost = f'TIME-{time.time() - begin_time:.3f}' diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 3692b486..4a14be31 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -1,6 +1,7 @@ import os -from typing import Callable, Dict, Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Union +import numpy as np import torch from torch import nn from torch.utils.data import Dataset @@ -11,9 +12,10 @@ from modelscope.metrics.builder import build_metric from modelscope.models.base import Model, TorchModel from modelscope.msdatasets import MsDataset from modelscope.preprocessors import Preprocessor, build_preprocessor -from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.config import Config from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, ModelFile, Tasks) +from modelscope.utils.hub import parse_label_mapping from .base import TRAINERS from .trainer import EpochBasedTrainer @@ -81,19 +83,32 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' model_dir = os.path.dirname(cfg_file) + self.label2id = None + self.id2label = None + self.num_labels = None self.cfg_modify_fn = cfg_modify_fn self.cfg = self.rebuild_config(Config.from_file(cfg_file)) - try: - labels = self.cfg.dataset.train.labels - except AttributeError: - labels = None - self.label2id = None - self.num_labels = None - if labels is not None and len(labels) > 0: - self.label2id = {label: idx for idx, label in enumerate(labels)} - self.id2label = {idx: label for idx, label in enumerate(labels)} - self.num_labels = len(labels) + label2id = parse_label_mapping(model_dir) + if label2id is not None: + self.label2id = label2id + self.id2label = {id: label for label, id in label2id.items()} + self.num_labels = len(label2id) + else: + try: + labels = self.cfg.dataset.train.labels + if labels is not None and len(labels) > 0: + self.label2id = { + label: idx + for idx, label in enumerate(labels) + } + self.id2label = { + idx: label + for idx, label in enumerate(labels) + } + self.num_labels = len(labels) + except AttributeError: + pass def build_dataset_keys(cfg): if cfg is not None: @@ -130,7 +145,13 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): def rebuild_config(self, cfg: Config): if self.cfg_modify_fn is not None: - return self.cfg_modify_fn(cfg) + cfg = self.cfg_modify_fn(cfg) + if not hasattr(cfg.model, 'label2id') and not hasattr( + cfg.model, 'id2label'): + if self.id2label is not None: + cfg.model['id2label'] = self.id2label + if self.label2id is not None: + cfg.model['label2id'] = self.label2id return cfg def build_model(self) -> Union[nn.Module, TorchModel]: @@ -203,6 +224,9 @@ class VecoTrainer(NlpEpochBasedTrainer): """ from modelscope.msdatasets.task_datasets import VecoDataset + if checkpoint_path is not None and os.path.isfile(checkpoint_path): + from modelscope.trainers.hooks import CheckpointHook + CheckpointHook.load_checkpoint(checkpoint_path, self) self.model.eval() self._mode = ModeKeys.EVAL metric_values = {} @@ -223,12 +247,10 @@ class VecoTrainer(NlpEpochBasedTrainer): self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) self.data_loader = self.eval_dataloader - metric_classes = [ - build_metric(metric, default_args={'trainer': self}) - for metric in self.metrics - ] - self.evaluation_loop(self.eval_dataloader, checkpoint_path, - metric_classes) + 
metric_classes = [build_metric(metric) for metric in self.metrics] + for m in metric_classes: + m.trainer = self + self.evaluation_loop(self.eval_dataloader, metric_classes) for m_idx, metric_cls in enumerate(metric_classes): if f'eval_dataset[{idx}]' not in metric_values: @@ -242,4 +264,8 @@ class VecoTrainer(NlpEpochBasedTrainer): else: break + for metric_name in self.metrics: + metric_values[metric_name] = np.average( + [m[metric_name] for m in metric_values.values()]) + return metric_values diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 6bfdd2a4..d188ae6f 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -import random import time from collections.abc import Mapping from distutils.version import LooseVersion @@ -8,11 +7,9 @@ from functools import partial from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union import json -import numpy as np import torch from torch import distributed as dist from torch import nn -from torch.nn.parallel import DistributedDataParallel from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler @@ -40,7 +37,8 @@ from modelscope.utils.device import create_device, verify_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg -from modelscope.utils.torch_utils import get_dist_info, init_dist +from modelscope.utils.torch_utils import (get_dist_info, init_dist, + set_random_seed) from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -75,6 +73,7 @@ class EpochBasedTrainer(BaseTrainer): this preprocessing action will be executed every time the dataset's __getitem__ is called. optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple containing the optimizer and the scheduler to use. + seed (int): The optional random seed for torch, cuda, numpy and random. max_epochs: (int, optional): Total training epochs. 
""" @@ -93,8 +92,11 @@ class EpochBasedTrainer(BaseTrainer): torch.optim.lr_scheduler._LRScheduler] = (None, None), model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, **kwargs): + self._seed = seed + set_random_seed(self._seed) if isinstance(model, str): if os.path.exists(model): self.model_dir = model if os.path.isdir( @@ -156,13 +158,18 @@ class EpochBasedTrainer(BaseTrainer): device_name = kwargs.get('device', 'gpu') verify_device(device_name) self.device = create_device(device_name) + self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, + task_data_config=self.cfg.dataset.get('train', None) if hasattr( + self.cfg, 'dataset') else None, preprocessor=self.train_preprocessor) self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, + task_data_config=self.cfg.dataset.get('val', None) if hasattr( + self.cfg, 'dataset') else None, preprocessor=self.eval_preprocessor) self.train_data_collator, self.eval_default_collate = None, None @@ -199,6 +206,7 @@ class EpochBasedTrainer(BaseTrainer): self._max_epochs = self.cfg.train.max_epochs else: self._max_epochs = kwargs['max_epochs'] + self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) if self._train_iters_per_epoch is None and hasattr( @@ -211,19 +219,16 @@ class EpochBasedTrainer(BaseTrainer): self.use_fp16 = kwargs.get('use_fp16', False) - # TODO @wenmeng.zwm add seed init fn - self._seed = 0 - if kwargs.get('launcher', None) is not None: init_dist(kwargs['launcher']) self._dist = get_dist_info()[1] > 1 + # model placement if self.device.type == 'cuda': self.model.to(self.device) if not is_parallel(self.model) and self._dist: self.model = self.to_parallel(self.model) - self.device = self.model.device def rebuild_config(self, cfg: Config): """A method used to rebuild the config, any subclass can override this method. @@ -295,6 +300,7 @@ class EpochBasedTrainer(BaseTrainer): def to_task_dataset(self, datasets: Union[Dataset, List[Dataset]], mode: str, + task_data_config: Config = None, preprocessor: Optional[Preprocessor] = None): """Build the task specific dataset processor for this trainer. 
@@ -307,20 +313,29 @@ class EpochBasedTrainer(BaseTrainer): if isinstance(datasets, TorchTaskDataset): return datasets elif isinstance(datasets, MsDataset): - cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ - else ConfigDict(type=None, mode=mode) + if task_data_config is None: + # adapt to some special models + task_data_config = ConfigDict( + type=self.cfg.model.type) if hasattr( + self.cfg, ConfigFields.model) else ConfigDict( + type=None) + task_data_config.update(dict(mode=mode)) return datasets.to_torch_dataset( - task_data_config=cfg, - task_name=self.cfg.task - if hasattr(self.cfg, ConfigFields.task) else None, + task_data_config=task_data_config, + task_name=self.cfg.task, preprocessors=preprocessor) elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): - cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ - else ConfigDict(type=None, mode=mode) + if task_data_config is None: + # adapt to some special models + task_data_config = ConfigDict( + type=self.cfg.model.type) if hasattr( + self.cfg, ConfigFields.model) else ConfigDict( + type=None) + task_data_config.update(dict(mode=mode)) datasets = [ d.to_torch_dataset( - task_data_config=cfg, + task_data_config=task_data_config, task_name=self.cfg.task, preprocessors=preprocessor) for d in datasets ] @@ -328,12 +343,12 @@ class EpochBasedTrainer(BaseTrainer): type=self.cfg.task, mode=mode, datasets=datasets) return build_task_dataset(cfg, self.cfg.task) else: - cfg = ConfigDict( - type=self.cfg.model.type, - mode=mode, - datasets=datasets, - preprocessor=preprocessor) - return build_task_dataset(cfg, self.cfg.task) + # avoid add no str value datasets, preprocessors in cfg + task_data_build_config = ConfigDict( + mode=mode, datasets=datasets, preprocessor=preprocessor) + task_data_build_config.update(task_data_config) + return build_task_dataset(task_data_build_config, + self.cfg.task) except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: return TorchTaskDataset( @@ -408,8 +423,16 @@ class EpochBasedTrainer(BaseTrainer): metrics = [metrics] return metrics - def train(self, *args, **kwargs): - self.model.train() + def set_checkpoint_file_to_hook(self, checkpoint_path): + if checkpoint_path is not None and os.path.isfile(checkpoint_path): + from modelscope.trainers.hooks import CheckpointHook + checkpoint_hooks = list( + filter(lambda hook: isinstance(hook, CheckpointHook), + self.hooks)) + for hook in checkpoint_hooks: + hook.checkpoint_file = checkpoint_path + + def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN if self.train_dataset is None: @@ -425,12 +448,17 @@ class EpochBasedTrainer(BaseTrainer): self.register_optimizers_hook() self.register_hook_from_cfg(self.cfg.train.hooks) + self.set_checkpoint_file_to_hook(checkpoint_path) + self.model.train() + self.train_loop(self.train_dataloader) - def evaluate(self, checkpoint_path=None, *arg, **kwargs): + def evaluate(self, checkpoint_path=None): + if checkpoint_path is not None and os.path.isfile(checkpoint_path): + from modelscope.trainers.hooks import CheckpointHook + CheckpointHook.load_checkpoint(checkpoint_path, self) self.model.eval() self._mode = ModeKeys.EVAL - if self.eval_dataset is None: self.eval_dataloader = self.get_eval_data_loader() else: @@ -444,8 +472,9 @@ class EpochBasedTrainer(BaseTrainer): metric_classes = [build_metric(metric) for metric in self.metrics] for m in metric_classes: m.trainer 
= self + metric_values = self.evaluation_loop(self.eval_dataloader, - checkpoint_path, metric_classes) + metric_classes) self._metric_values = metric_values return metric_values @@ -473,12 +502,13 @@ class EpochBasedTrainer(BaseTrainer): self.cfg.parallel.update( dict(module=model, device_ids=[torch.cuda.current_device()])) return build_parallel(self.cfg.parallel) - model.to(f'cuda:{torch.cuda.current_device()}') + dp_cfg = dict( type='DistributedDataParallel', module=model, find_unused_parameters=True, device_ids=[torch.cuda.current_device()]) + return build_parallel(dp_cfg) def train_step(self, model, inputs): @@ -502,10 +532,8 @@ class EpochBasedTrainer(BaseTrainer): model.train() self._mode = ModeKeys.TRAIN # call model forward but not __call__ to skip postprocess - forward_func = model.module.forward if \ - isinstance(model, DistributedDataParallel) else model.forward if isinstance(inputs, - Mapping) and not func_receive_dict_inputs(forward_func): + Mapping) and not func_receive_dict_inputs(model.forward): train_outputs = model.forward(**inputs) else: train_outputs = model.forward(inputs) @@ -526,7 +554,7 @@ class EpochBasedTrainer(BaseTrainer): value = train_outputs.get(key, None) if value is not None: if dist.is_available() and dist.is_initialized(): - value = value.data.clone() + value = value.data.clone().to('cuda') dist.all_reduce(value.div_(dist.get_world_size())) log_vars.update({key: value.item()}) self.log_buffer.update(log_vars) @@ -615,11 +643,6 @@ class EpochBasedTrainer(BaseTrainer): if hasattr(data_cfg, 'name'): dataset = MsDataset.load( dataset_name=data_cfg.name, - split=data_cfg.split, - subset_name=data_cfg.subset_name if hasattr( - data_cfg, 'subset_name') else None, - hub=data_cfg.hub - if hasattr(data_cfg, 'hub') else Hubs.modelscope, **data_cfg, ) cfg = ConfigDict(type=self.cfg.model.type, mode=mode) @@ -786,19 +809,22 @@ class EpochBasedTrainer(BaseTrainer): """ Training loop used by `EpochBasedTrainer.train()` """ self.invoke_hook(TrainerStages.before_run) - self._epoch = 0 kwargs = {} self.model.train() for _ in range(self._epoch, self._max_epochs): self.invoke_hook(TrainerStages.before_train_epoch) time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): + if i < self.inner_iter: + # inner_iter may be read out from the checkpoint file, so skip the trained iters in the epoch. + continue data_batch = to_device(data_batch, self.device) self.data_batch = data_batch self._inner_iter = i self.invoke_hook(TrainerStages.before_train_iter) self.train_step(self.model, data_batch, **kwargs) self.invoke_hook(TrainerStages.after_train_iter) + # Value changed after the hooks are invoked, do not move them above the invoke_hook code. del self.data_batch self._iter += 1 self._mode = ModeKeys.TRAIN @@ -807,12 +833,14 @@ class EpochBasedTrainer(BaseTrainer): break self.invoke_hook(TrainerStages.after_train_epoch) + # Value changed after the hooks are invoked, do not move them above the invoke_hook code. + self._inner_iter = 0 self._epoch += 1 time.sleep(1) # wait for some hooks like loggers to finish self.invoke_hook(TrainerStages.after_run) - def evaluation_loop(self, data_loader, checkpoint_path, metric_classes): + def evaluation_loop(self, data_loader, metric_classes): """ Evaluation loop used by `EpochBasedTrainer.evaluate()`. 
""" @@ -825,7 +853,7 @@ class EpochBasedTrainer(BaseTrainer): tmpdir=None, gpu_collect=False, metric_classes=metric_classes, - data_loader_iters_per_gpu=self.iters_per_epoch) + data_loader_iters_per_gpu=self._eval_iters_per_epoch) else: from modelscope.trainers.utils.inference import single_gpu_test metric_values = single_gpu_test( @@ -833,7 +861,7 @@ class EpochBasedTrainer(BaseTrainer): data_loader, device=self.device, metric_classes=metric_classes, - data_loader_iters=self.iters_per_epoch) + data_loader_iters=self._eval_iters_per_epoch) self._inner_iter = self.iters_per_epoch - 1 # start from index 0 @@ -922,6 +950,4 @@ def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to # num_worker * rank + worker_id + user_seed worker_seed = num_workers * rank + worker_id + seed - np.random.seed(worker_seed) - random.seed(worker_seed) - torch.manual_seed(worker_seed) + set_random_seed(worker_seed) diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 263a81b3..f59100cb 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import ast import contextlib import hashlib @@ -293,6 +295,9 @@ class AstScaning(object): if type(attribute_node).__name__ == 'Str': result.append((getattr(node, 'arg'), attribute_node.s, None)) + elif type(attribute_node).__name__ == 'Constant': + result.append( + (getattr(node, 'arg'), attribute_node.value, None)) else: result.append((getattr(node, 'arg'), ) + _get_attribute_item(attribute_node)) @@ -394,7 +399,7 @@ class AstScaning(object): def generate_ast(self, file): self._refresh() - with open(file, 'r') as code: + with open(file, 'r', encoding='utf8') as code: data = code.readlines() data = ''.join(data) diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 61964345..4c2c45cc 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -1,4 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import struct +from typing import Union +from urllib.parse import urlparse + +from modelscope.fileio.file import HTTPStorage + SEGMENT_LENGTH_TRAIN = 16000 @@ -29,3 +35,46 @@ def audio_norm(x): scalarx = 10**(-25 / 20) / rmsx x = x * scalarx return x + + +def extract_pcm_from_wav(wav: bytes) -> bytes: + data = wav + if len(data) > 44: + frame_len = 44 + file_len = len(data) + try: + header_fields = {} + header_fields['ChunkID'] = str(data[0:4], 'UTF-8') + header_fields['Format'] = str(data[8:12], 'UTF-8') + header_fields['Subchunk1ID'] = str(data[12:16], 'UTF-8') + if header_fields['ChunkID'] == 'RIFF' and header_fields[ + 'Format'] == 'WAVE' and header_fields[ + 'Subchunk1ID'] == 'fmt ': + header_fields['SubChunk1Size'] = struct.unpack( + ' Union[bytes, str]: + result = urlparse(url) + if result.scheme is not None and len(result.scheme) > 0: + storage = HTTPStorage() + data = storage.read(url) + data = extract_pcm_from_wav(data) + else: + data = url + + return data diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 425d3312..a9d7f396 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -8,14 +8,17 @@ from shutil import copytree, ignore_patterns, rmtree from typing import Callable, List, Optional, Union import json -import numpy as np import torch from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler from modelscope import __version__ from modelscope.fileio import File, LocalStorage from modelscope.utils.config import JSONIteratorEncoder from modelscope.utils.constant import ConfigFields, ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) storage = LocalStorage() @@ -40,24 +43,27 @@ def weights_to_cpu(state_dict): def save_checkpoint(model: torch.nn.Module, filename: str, optimizer: Optional[Optimizer] = None, + lr_scheduler: Optional[_LRScheduler] = None, meta: Optional[dict] = None, with_meta: bool = True) -> None: """Save checkpoint to file. The checkpoint will have 3 fields: ``meta``, ``state_dict`` and - ``optimizer``. By default ``meta`` will contain version and time info. + ``optimizer``. By default, ``meta`` will contain version and time info. Args: model (Module): Module whose params are to be saved. filename (str): Checkpoint filename. optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + lr_scheduler(:obj:`_LRScheduler`, optional): LRScheduler to be saved. meta (dict, optional): Metadata to be saved in checkpoint. 
+ with_meta (bool, optional): """ if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') - meta.update(modescope=__version__, time=time.asctime()) + meta.update(modelscope=__version__, time=time.asctime()) if isinstance(model, torch.nn.parallel.DistributedDataParallel): model = model.module @@ -71,22 +77,68 @@ def save_checkpoint(model: torch.nn.Module, 'meta': meta, 'state_dict': weights_to_cpu(model.state_dict()) } + + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint['optimizer'] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint['optimizer'] = {} + for name, optim in optimizer.items(): + checkpoint['optimizer'][name] = optim.state_dict() + + # save lr_scheduler state dict in the checkpoint + if lr_scheduler is not None and hasattr(lr_scheduler, 'state_dict'): + checkpoint['lr_scheduler'] = lr_scheduler.state_dict() else: checkpoint = weights_to_cpu(model.state_dict()) - # save optimizer state dict in the checkpoint - if isinstance(optimizer, Optimizer): - checkpoint['optimizer'] = optimizer.state_dict() - elif isinstance(optimizer, dict): - checkpoint['optimizer'] = {} - for name, optim in optimizer.items(): - checkpoint['optimizer'][name] = optim.state_dict() - with io.BytesIO() as f: torch.save(checkpoint, f) File.write(f.getvalue(), filename) +def load_checkpoint(filename, + model, + optimizer: Optimizer = None, + lr_scheduler: _LRScheduler = None): + if not os.path.exists(filename): + raise ValueError(f'Checkpoint file {filename} does not exist!') + checkpoint = torch.load(filename, map_location='cpu') + + if optimizer is not None: + if 'optimizer' in checkpoint: + if isinstance(optimizer, Optimizer): + optimizer.load_state_dict(checkpoint['optimizer']) + elif isinstance(optimizer, dict): + optimizer_dict = checkpoint['optimizer'] + for key, optimizer_ins in optimizer.items(): + if key in optimizer_dict: + optimizer_ins.load_state_dict(optimizer_dict[key]) + else: + logger.warn( + f'The state dict of optimizer {key} cannot be found in checkpoint file: {filename}' + ) + else: + logger.warn( + f'The state dict of optimizer cannot be found in checkpoint file: {filename}' + ) + + if lr_scheduler is not None: + if 'lr_scheduler' in checkpoint: + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + else: + logger.warn( + f'The state dict of lr_scheduler cannot be found in checkpoint file: {filename}' + ) + + state_dict = checkpoint if 'state_dict' not in checkpoint else checkpoint[ + 'state_dict'] + model.load_state_dict(state_dict) + + if 'meta' in checkpoint: + return checkpoint.get('meta', {}) + + def save_pretrained(model, target_folder: Union[str, os.PathLike], save_checkpoint_name: str = None, diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 7d972118..0b966bef 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -1,4 +1,6 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright (c) OpenMMLab. All rights reserved. +# Major implementation is borrowed and modified from +# https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py import copy import os diff --git a/modelscope/utils/config_ds.py b/modelscope/utils/config_ds.py index bafe3f99..fce823c4 100644 --- a/modelscope/utils/config_ds.py +++ b/modelscope/utils/config_ds.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os from pathlib import Path diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 47d38dd7..d6b0da40 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -70,6 +70,9 @@ class CVTasks(object): crowd_counting = 'crowd-counting' movie_scene_segmentation = 'movie-scene-segmentation' + # video editing + video_inpainting = 'video-inpainting' + # reid and tracking video_single_object_tracking = 'video-single-object-tracking' video_summarization = 'video-summarization' @@ -86,6 +89,8 @@ class NLPTasks(object): sentiment_analysis = 'sentiment-analysis' sentence_similarity = 'sentence-similarity' text_classification = 'text-classification' + sentence_embedding = 'sentence-embedding' + passage_ranking = 'passage-ranking' relation_extraction = 'relation-extraction' zero_shot = 'zero-shot' translation = 'translation' @@ -133,6 +138,22 @@ class MultiModalTasks(object): image_text_retrieval = 'image-text-retrieval' +class TasksIODescriptions(object): + image_to_image = 'image_to_image', + images_to_image = 'images_to_image', + image_to_text = 'image_to_text', + seed_to_image = 'seed_to_image', + text_to_speech = 'text_to_speech', + text_to_text = 'text_to_text', + speech_to_text = 'speech_to_text', + speech_to_speech = 'speech_to_speech' + speeches_to_speech = 'speeches_to_speech', + visual_grounding = 'visual_grounding', + visual_question_answering = 'visual_question_answering', + visual_entailment = 'visual_entailment', + generative_multi_modal_embedding = 'generative_multi_modal_embedding' + + class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): """ Names for tasks supported by modelscope. @@ -225,6 +246,7 @@ class ModelFile(object): ONNX_MODEL_FILE = 'model.onnx' LABEL_MAPPING = 'label_mapping.json' TRAIN_OUTPUT_DIR = 'output' + TS_MODEL_FILE = 'model.ts' class ConfigFields(object): diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index cb07ba1a..98ba533e 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
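One subtlety of the TasksIODescriptions constants added above: most assignments end with a trailing comma, so those attributes are one-element tuples rather than plain strings, while the few entries without a trailing comma (e.g. speech_to_speech) stay str. This is harmless as long as the values are only used as opaque dict keys/values, as demo_utils does. A quick check, for illustration only:

# Illustration of the trailing-comma effect in TasksIODescriptions:
from modelscope.utils.constant import TasksIODescriptions

print(TasksIODescriptions.image_to_image)    # ('image_to_image',)  -> tuple
print(TasksIODescriptions.speech_to_speech)  # 'speech_to_speech'   -> str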
+ import cv2 import numpy as np @@ -66,8 +68,8 @@ def draw_joints(image, np_kps, score, threshold=0.2): def draw_box(image, box): - cv2.rectangle(image, (int(box[0][0]), int(box[0][1])), - (int(box[1][0]), int(box[1][1])), (0, 0, 255), 2) + cv2.rectangle(image, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), (0, 0, 255), 2) def realtime_object_detection_bbox_vis(image, bboxes): @@ -89,6 +91,27 @@ def draw_keypoints(output, original_image): return image +def draw_face_detection_no_lm_result(img_path, detection_result): + bboxes = np.array(detection_result[OutputKeys.BOXES]) + scores = np.array(detection_result[OutputKeys.SCORES]) + img = cv2.imread(img_path) + assert img is not None, f"Can't read img: {img_path}" + for i in range(len(scores)): + bbox = bboxes[i].astype(np.int32) + x1, y1, x2, y2 = bbox + score = scores[i] + cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2) + cv2.putText( + img, + f'{score:.2f}', (x1, y2), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + print(f'Found {len(scores)} faces') + return img + + def draw_facial_expression_result(img_path, facial_expression_result): label_idx = facial_expression_result[OutputKeys.LABELS] map_list = [ diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py new file mode 100644 index 00000000..41ac0bca --- /dev/null +++ b/modelscope/utils/demo_utils.py @@ -0,0 +1,245 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io + +import cv2 +import json +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks, TasksIODescriptions + +TASKS_INPUT_TEMPLATES = { + # vision tasks + Tasks.image_portrait_stylization: TasksIODescriptions.image_to_image, + Tasks.portrait_matting: TasksIODescriptions.image_to_image, + Tasks.skin_retouching: TasksIODescriptions.image_to_image, + Tasks.image_captioning: TasksIODescriptions.image_to_text, + Tasks.image_denoising: TasksIODescriptions.image_to_image, + Tasks.image_portrait_enhancement: TasksIODescriptions.image_to_image, + Tasks.image_super_resolution: TasksIODescriptions.image_to_image, + Tasks.image_colorization: TasksIODescriptions.image_to_image, + Tasks.image_color_enhancement: TasksIODescriptions.image_to_image, + Tasks.face_image_generation: TasksIODescriptions.seed_to_image, + Tasks.image_style_transfer: TasksIODescriptions.images_to_image, + Tasks.image_segmentation: TasksIODescriptions.image_to_text, + Tasks.image_object_detection: TasksIODescriptions.image_to_text, + + # not tested + Tasks.image_classification: TasksIODescriptions.image_to_text, + Tasks.ocr_detection: TasksIODescriptions.image_to_text, + Tasks.ocr_recognition: TasksIODescriptions.image_to_text, + Tasks.body_2d_keypoints: TasksIODescriptions.image_to_text, + + # nlp tasks + Tasks.text_classification: TasksIODescriptions.text_to_text, + Tasks.text_generation: TasksIODescriptions.text_to_text, + Tasks.word_segmentation: TasksIODescriptions.text_to_text, + Tasks.text_error_correction: TasksIODescriptions.text_to_text, + Tasks.named_entity_recognition: TasksIODescriptions.text_to_text, + Tasks.sentiment_classification: TasksIODescriptions.text_to_text, + + # audio tasks + Tasks.text_to_speech: TasksIODescriptions.text_to_speech, + Tasks.auto_speech_recognition: TasksIODescriptions.speech_to_text, + Tasks.keyword_spotting: TasksIODescriptions.speech_to_text, + Tasks.acoustic_noise_suppression: TasksIODescriptions.speech_to_speech, + Tasks.acoustic_echo_cancellation: 
TasksIODescriptions.speeches_to_speech, + + # multi-modal + Tasks.visual_grounding: TasksIODescriptions.visual_grounding, + Tasks.visual_question_answering: + TasksIODescriptions.visual_question_answering, + Tasks.visual_entailment: TasksIODescriptions.visual_entailment, + Tasks.generative_multi_modal_embedding: + TasksIODescriptions.generative_multi_modal_embedding, + + # new tasks + Tasks.virtual_try_on: TasksIODescriptions.images_to_image, + + # TODO(lingcai.wl): support more tasks and implement corresponding example +} + +INPUT_EXAMPLES = { + # Must align with task schema defined in the Widget section of model card= + # cv + TasksIODescriptions.image_to_image: { + 'inputs': [ + 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png' + ], + 'urlPaths': { + 'outUrls': [{ + 'outputKey': OutputKeys.OUTPUT_IMG, + 'fileType': 'png' + }] + } + }, + TasksIODescriptions.images_to_image: { + 'inputs': [ + 'https://modelscope.oss-cn-beijing.aliyuncs.com/demo/image-style-transfer/style_transfer_content.jpg', + 'https://modelscope.oss-cn-beijing.aliyuncs.com/demo/image-style-transfer/style_transfer_style.jpg' + ], + 'urlPaths': { + 'outUrls': [{ + 'outputKey': OutputKeys.OUTPUT_IMG, + 'fileType': 'png' + }] + } + }, + TasksIODescriptions.image_to_text: { + 'inputs': [ + 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png' + ], + 'urlPaths': {} + }, + # nlp + TasksIODescriptions.text_to_text: { + 'inputs': ['test'], + 'urlPaths': {} + }, + + # audio + TasksIODescriptions.speech_to_text: { + 'inputs': [ + 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav' + ], + 'urlPaths': {} + }, + TasksIODescriptions.text_to_speech: { + 'inputs': ['北京今天天气怎么样'], + 'urlPaths': { + 'outUrls': [{ + 'outputKey': OutputKeys.OUTPUT_PCM, + 'fileType': 'pcm' + }] + } + }, + TasksIODescriptions.speeches_to_speech: { + 'inputs': [ + 'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/nearend_mic.wav', + 'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/nearend_speech.wav' + ], + 'urlPaths': { + 'outUrls': [{ + 'outputKey': OutputKeys.OUTPUT_PCM, + 'fileType': 'wav' + }] + } + }, + TasksIODescriptions.speech_to_speech: { + 'inputs': [ + 'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/speech_with_noise.wav' + ], + 'urlPaths': { + 'outUrls': [{ + 'outputKey': OutputKeys.OUTPUT_PCM, + 'fileType': 'wav' + }] + } + }, + + # multi modal + TasksIODescriptions.visual_grounding: { + 'task': + Tasks.visual_grounding, + 'inputs': [ + 'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-grounding/visual_grounding.png', + 'a blue turtle-like pokemon with round head' + ], + 'urlPaths': {} + }, + TasksIODescriptions.visual_question_answering: { + 'task': + Tasks.visual_question_answering, + 'inputs': [ + 'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png', + 'what is grown on the plant?' 
+ ], + 'urlPaths': {} + }, + TasksIODescriptions.visual_entailment: { + 'task': + Tasks.visual_entailment, + 'inputs': [ + 'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-entailment/visual_entailment.jpg', + 'there are two birds.', 'test' + ], + 'urlPaths': {} + }, + TasksIODescriptions.generative_multi_modal_embedding: { + 'task': + Tasks.generative_multi_modal_embedding, + 'inputs': [ + 'http://clip-multimodal.oss-cn-beijing.aliyuncs.com/lingchen/demo/dogs.jpg', + 'dogs playing in the grass' + ], + 'urlPaths': {} + }, +} + + +class DemoCompatibilityCheck(object): + + def compatibility_check(self): + if self.task not in TASKS_INPUT_TEMPLATES: + print('task is not supported in demo service so far') + return False + if TASKS_INPUT_TEMPLATES[self.task] not in INPUT_EXAMPLES: + print('no example input for this task') + return False + + print('testing demo: ', self.task, self.model_id) + test_pipline = pipeline(self.task, self.model_id) + req = INPUT_EXAMPLES[TASKS_INPUT_TEMPLATES[self.task]] + output = test_pipline(preprocess(req)) + json.dumps(output, cls=NumpyEncoder) + result = postprocess(req, output) + print(result) + return True + + +class NumpyEncoder(json.JSONEncoder): + + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + + if isinstance(obj, np.floating): + return float(obj) + + if isinstance(obj, np.integer): + return int(obj) + + return json.JSONEncoder.default(self, obj) + + +def preprocess(req): + if len(req['inputs']) == 1: + inputs = req['inputs'][0] + else: + inputs = tuple(req['inputs']) + return inputs + + +def postprocess(req, resp): + out_urls = req.get('urlPaths').get('outUrls') + if out_urls is None or len(out_urls) == 0: + return resp + new_resp = resp + if isinstance(resp, str): + new_resp = json.loads(resp) + for out_url in out_urls: + output_key = out_url['outputKey'] + file_type = out_url['fileType'] + new_resp.get(output_key) + if file_type == 'png' or file_type == 'jpg': + content = new_resp.get(output_key) + _, img_encode = cv2.imencode('.' + file_type, content) + img_bytes = img_encode.tobytes() + return type(img_bytes) + else: + out_mem_file = io.BytesIO() + out_mem_file.write(new_resp.get(output_key)) + return type(out_mem_file) + # TODO(lingcai.wl): support more file type diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py index df5470f9..4bbd09d8 100644 --- a/modelscope/utils/device.py +++ b/modelscope/utils/device.py @@ -19,10 +19,12 @@ def verify_device(device_name): Return: device info (tuple): device_type and device_id, if device_id is not set, will use 0 as default. """ + err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.' + assert device_name is not None and device_name != '', err_msg device_name = device_name.lower() eles = device_name.split(':') - err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.' assert len(eles) <= 2, err_msg + assert device_name is not None assert eles[0] in ['cpu', 'cuda', 'gpu'], err_msg device_type = eles[0] device_id = None diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index e7d1442f..a6bbc8b3 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -96,3 +96,18 @@ DECORD_IMPORT_ERROR = """ {0} requires the decord library but it was not found in your environment. 
You can install it with pip: `pip install decord>=0.6.0` """ + +# docstyle-ignore +DEEPSPEED_IMPORT_ERROR = """ +{0} requires the Deepspeed library but it was not found in your environment. Checkout the instructions on the +installation page: https://www.deepspeed.ai/tutorials/advanced-install/ and follow the ones that match your environment. +""" + +# docstyle-ignore +FAIRSEQ_IMPORT_ERROR = """ +{0} requires the fairseq library but it was not found in your environment. +You can install it with pip on linux: +`pip install fairseq` +On windows, please checkout the instructions on the +installation page: https://github.com/facebookresearch/fairseq and follow the ones that match your environment. +""" diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index f79097fe..2dbe7045 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -77,19 +77,26 @@ def auto_load(model: Union[str, List[str]]): def get_model_type(model_dir): """Get the model type from the configuration. - This method will try to get the 'model.type' or 'model.model_type' field from the configuration.json file. - If this file does not exist, the method will try to get the 'model_type' field from the config.json. + This method will try to get the model type from 'model.backbone.type', + 'model.type' or 'model.model_type' field in the configuration.json file. If + this file does not exist, the method will try to get the 'model_type' field + from the config.json. - @param model_dir: The local model dir to use. - @return: The model type string, returns None if nothing is found. + @param model_dir: The local model dir to use. @return: The model type + string, returns None if nothing is found. """ try: configuration_file = osp.join(model_dir, ModelFile.CONFIGURATION) config_file = osp.join(model_dir, 'config.json') if osp.isfile(configuration_file): cfg = Config.from_file(configuration_file) - return cfg.model.model_type if hasattr(cfg.model, 'model_type') and not hasattr(cfg.model, 'type') \ - else cfg.model.type + if hasattr(cfg.model, 'backbone'): + return cfg.model.backbone.type + elif hasattr(cfg.model, + 'model_type') and not hasattr(cfg.model, 'type'): + return cfg.model.model_type + else: + return cfg.model.type elif osp.isfile(config_file): cfg = Config.from_file(config_file) return cfg.model_type if hasattr(cfg, 'model_type') else None @@ -123,13 +130,24 @@ def parse_label_mapping(model_dir): if hasattr(config, ConfigFields.model) and hasattr( config[ConfigFields.model], 'label2id'): label2id = config[ConfigFields.model].label2id + elif hasattr(config, ConfigFields.model) and hasattr( + config[ConfigFields.model], 'id2label'): + id2label = config[ConfigFields.model].id2label + label2id = {label: id for id, label in id2label.items()} elif hasattr(config, ConfigFields.preprocessor) and hasattr( config[ConfigFields.preprocessor], 'label2id'): label2id = config[ConfigFields.preprocessor].label2id + elif hasattr(config, ConfigFields.preprocessor) and hasattr( + config[ConfigFields.preprocessor], 'id2label'): + id2label = config[ConfigFields.preprocessor].id2label + label2id = {label: id for id, label in id2label.items()} - if label2id is None: - config_path = os.path.join(model_dir, 'config.json') + config_path = os.path.join(model_dir, 'config.json') + if label2id is None and os.path.exists(config_path): config = Config.from_file(config_path) if hasattr(config, 'label2id'): label2id = config.label2id + elif hasattr(config, 'id2label'): + id2label = config.id2label + label2id = {label: id for id, label in 
id2label.items()} return label2id diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index c9bea020..2a6fdc80 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -290,6 +290,8 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('easyasr', (is_package_available('easyasr'), AUDIO_IMPORT_ERROR)), ('kwsbp', (is_package_available('kwsbp'), AUDIO_IMPORT_ERROR)), ('decord', (is_package_available('decord'), DECORD_IMPORT_ERROR)), + ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)), + ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)), ]) SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) diff --git a/modelscope/utils/model_tag.py b/modelscope/utils/model_tag.py new file mode 100644 index 00000000..7065e8f3 --- /dev/null +++ b/modelscope/utils/model_tag.py @@ -0,0 +1,184 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import logging +import os + +import json +import requests + +from modelscope.version import __version__ + + +# 打标 +class ModelTag(object): + _URL = os.environ.get('MODEL_TAG_URL', None) + + # 模型测试结果 + BATCH_COMMIT_RESULT_URL = f'{_URL}/batchCommitResult' + # 测试阶段完成 + BATCH_REFRESH_STAGE_URL = f'{_URL}/batchRefreshStage' + # query_model_stage + QUERY_MODEL_STAGE_URL = f'{_URL}/queryModelStage' + + HEADER = {'Content-Type': 'application/json'} + + # 检测结果 + MODEL_SKIP = 0 + MODEL_FAIL = 1 + MODEL_PASS = 2 + + class ItemResult(object): + + def __init__(self): + self.result = 0 + self.name = '' + self.info = '' + + def to_json(self): + return { + 'name': self.name, + 'result': self.result, + 'info': self.info + } + + def __init__(self): + self.job_name = '' + self.job_id = '' + self.model = '' + self.sdk_version = '' + self.image_version = '' + self.domain = '' + self.task = '' + self.source = '' + self.stage = '' + # ItemResult list + self.item_result = [] + + # 发送请求 + def _post_request(self, url, param): + try: + logging.info(url + ' query: ' + + str(json.dumps(param, ensure_ascii=False))) + res = requests.post( + url=url, + headers=self.HEADER, + data=json.dumps(param, ensure_ascii=False).encode('utf8')) + if res.status_code == 200: + logging.info(f'{url} post结果: ' + res.text) + res_json = json.loads(res.text) + if int(res_json['errorCode']) == 200: + return res_json['content'] + else: + logging.error(res.text) + else: + logging.error(res.text) + except Exception as e: + logging.error(e) + + return None + + # 提交模型测试结果 + def batch_commit_result(self): + try: + param = { + 'sdkVersion': + self.sdk_version, + 'imageVersion': + self.image_version, + 'source': + self.source, + 'jobName': + self.job_name, + 'jobId': + self.job_id, + 'modelList': [{ + 'model': self.model, + 'domain': self.domain, + 'task': self.task, + 'itemResult': self.item_result + }] + } + return self._post_request(self.BATCH_COMMIT_RESULT_URL, param) + + except Exception as e: + logging.error(e) + + return + + # 测试阶段完成 + def batch_refresh_stage(self): + try: + param = { + 'sdkVersion': + self.sdk_version, + 'imageVersion': + self.image_version, + 'source': + self.source, + 'stage': + self.stage, + 'modelList': [{ + 'model': self.model, + 'domain': self.domain, + 'task': self.task + }] + } + return self._post_request(self.BATCH_REFRESH_STAGE_URL, param) + + except Exception as e: + logging.error(e) + + return + + # 查询模型某个阶段的最新测试结果(只返回单个结果 + def query_model_stage(self): + try: + param = { + 'sdkVersion': self.sdk_version, + 'model': self.model, + 'stage': self.stage, + 'imageVersion': self.image_version + } + return 
self._post_request(self.QUERY_MODEL_STAGE_URL, param) + + except Exception as e: + logging.error(e) + + return None + + # 提交模型UT测试结果 + """ + model_tag = ModelTag() + model_tag.model = "XXX" + model_tag.sdk_version = "0.3.7" + model_tag.domain = "nlp" + model_tag.task = "word-segmentation" + item = model_tag.ItemResult() + item.result = model_tag.MODEL_PASS + item.name = "ALL" + item.info = "" + model_tag.item_result.append(item.to_json()) + """ + + def commit_ut_result(self): + if self._URL is not None and self._URL != '': + self.job_name = 'UT' + self.source = 'dev' + self.stage = 'integration' + + self.batch_commit_result() + self.batch_refresh_stage() + + +def commit_model_ut_result(model_name, ut_result): + model_tag = ModelTag() + model_tag.model = model_name.replace('damo/', '') + model_tag.sdk_version = __version__ + # model_tag.domain = "" + # model_tag.task = "" + item = model_tag.ItemResult() + item.result = ut_result + item.name = 'ALL' + item.info = '' + model_tag.item_result.append(item.to_json()) + model_tag.commit_ut_result() diff --git a/modelscope/utils/nlp/__init__.py b/modelscope/utils/nlp/__init__.py index e69de29b..62c0b888 100644 --- a/modelscope/utils/nlp/__init__.py +++ b/modelscope/utils/nlp/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .utils import import_external_nltk_data + +else: + _import_structure = { + 'utils': ['import_external_nltk_data'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py new file mode 100755 index 00000000..2b590a10 --- /dev/null +++ b/modelscope/utils/nlp/distributed.py @@ -0,0 +1,130 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +import torch.distributed as dist +from megatron import mpu +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.modules import Module + +from modelscope.utils.torch_utils import init_dist + + +def initialize_distributed(rank, mpu, world_size, model_parallel_size, + master_ip, master_port): + """Initialize torch.distributed.""" + # Manually set the device ids. + device = rank % torch.cuda.device_count() + torch.cuda.set_device(device) + # Call the init process + init_method = 'tcp://' + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend='nccl', world_size=8, rank=rank, init_method=init_method) + # Set the model-parallel communicators. 
+ mpu.initialize_model_parallel(model_parallel_size) + + +def normal_init_method(mean, std): + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def scaled_init_method(mean, std, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = std / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + self.data_parallel_group = mpu.get_data_parallel_group() + src_rank = mpu.get_model_parallel_rank() + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, src_rank, group=self.data_parallel_group) + + def allreduce_params(reduce_after=True, + no_scale=False, + fp32_allreduce=False): + if (self.needs_reduction): + self.needs_reduction = False + buckets = {} + for name, param in self.module.named_parameters(): + if param.requires_grad and param.grad is not None: + tp = (param.data.type()) + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print( + 'WARNING: gloo dist backend for half parameters may be extremely slow.', + 'It is recommended to use the NCCL backend in this case.' + ) + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + dist.all_reduce(coalesced, group=self.data_parallel_group) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + for buf, synced in zip( + grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + + return sd + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) diff --git a/modelscope/utils/nlp/load_checkpoint.py b/modelscope/utils/nlp/load_checkpoint.py new file mode 100755 index 00000000..6534e18d --- /dev/null +++ b/modelscope/utils/nlp/load_checkpoint.py @@ -0,0 +1,117 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch + + +def load_checkpoint(model, + load_dir, + tag, + load_module_strict=True, + load_optimizer_states=True, + load_lr_scheduler_states=True): + r"""Load training checkpoint + + Arguments: + load_dir: Required. Directory to load the checkpoint from + tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. + load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and + checkpoint match. + load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. + Ex. ADAM's momentum and variance + load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint. + Return: + load_path: Path of the loaded checkpoint. None if loading the checkpoint failed + client_state: State dictionary used for loading required training states in the client code. + """ + + load_path, client_states = _load_checkpoint( + model, + load_dir, + tag, + load_module_strict=load_module_strict, + load_optimizer_states=load_optimizer_states, + load_lr_scheduler_states=load_lr_scheduler_states) + + if load_optimizer_states: + if model.zero_optimization() and load_path is not None: + model._load_zero_checkpoint( + load_dir, tag, load_optimizer_states=load_optimizer_states) + + return load_path, client_states + + +def _get_ckpt_name(mpu, checkpoints_path, tag): + mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank() + ckpt_name = os.path.join( + checkpoints_path, str(tag), + 'mp_rank_{:02d}'.format(mp_rank) + '_model_states.pt') + return ckpt_name + + +def pre_load(mpu, load_dir, tag=''): + load_path = _get_ckpt_name(mpu, load_dir, tag) + checkpoint = torch.load( + load_path, map_location=lambda storage, loc: storage) + return checkpoint['module'] + + +def _load_checkpoint(model, + load_dir, + tag, + load_module_strict=True, + load_optimizer_states=True, + load_lr_scheduler_states=True): + + load_path = model._get_ckpt_name(load_dir, tag) + + if not os.path.exists(load_path): + return None, None + + checkpoint = torch.load( + load_path, map_location=lambda storage, loc: storage) + + model.load_module_state_dict( + state_dict=checkpoint['module'], strict=load_module_strict) + if not model.zero_optimization() and load_optimizer_states: + if model.fp16_enabled(): + model.optimizer.load_state_dict( + checkpoint['optimizer'], + load_optimizer_states=load_optimizer_states) + elif load_optimizer_states: + model.optimizer.load_state_dict(checkpoint['optimizer']) + + if load_lr_scheduler_states and model.lr_scheduler is not None: + model.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + + model.csr_tensor_module_names = checkpoint['csr_tensor_module_names'] + model.global_steps = checkpoint['global_steps'] + model.global_samples = checkpoint.get( + 'global_samples', model.global_steps * model.train_batch_size()) + model.skipped_steps = checkpoint['skipped_steps'] + model.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] + model.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] + deepspeed_states = [ + 'module', 'optimizer', 'lr_scheduler', 'csr_tensor_module_names', + 'skipped_steps', 'global_steps', 'dp_world_size', 'mp_world_size' + ] + client_state = { + key: value + for key, value in checkpoint.items() if key not in deepspeed_states + } + + return load_path, client_state diff --git a/modelscope/utils/nlp/nlp_utils.py 
b/modelscope/utils/nlp/nlp_utils.py index 35b374f2..eba12103 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/nlp_utils.py @@ -2,7 +2,8 @@ from typing import List from modelscope.outputs import OutputKeys from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline) + DialogStateTrackingPipeline, + TableQuestionAnsweringPipeline) def text2sql_tracking_and_print_results( @@ -41,3 +42,17 @@ def tracking_and_print_dialog_states( print(json.dumps(result)) history_states.extend([result[OutputKeys.OUTPUT], {}]) + + +def tableqa_tracking_and_print_results( + test_case, pipelines: List[TableQuestionAnsweringPipeline]): + for pipeline in pipelines: + historical_queries = None + for question in test_case['utterance']: + output_dict = pipeline({ + 'question': question, + 'history_sql': historical_queries + }) + print('output_dict', output_dict['output'].string, + output_dict['output'].query) + historical_queries = output_dict['history'] diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py new file mode 100644 index 00000000..4578ccc4 --- /dev/null +++ b/modelscope/utils/nlp/space/clean_dataset.py @@ -0,0 +1,333 @@ +import os +import re + +from . import ontology + + +def clean_text_split_dot(text): + text = re.sub(r'([a-zT]+)\.([a-z])', r'\1 . \2', + text) # 'abc.xyz' -> 'abc . xyz' + text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text) # if 'abc. ' -> 'abc . ' + return text + + +def clean_text(data_dir, text): + text = text.strip() + text = text.lower() + text = text.replace(u'’', "'") + text = text.replace(u'‘', "'") + text = text.replace(';', ',') + text = text.replace('"', ' ') + text = text.replace('/', ' and ') + text = text.replace("don't", "do n't") + text = clean_time(text) + baddata = { + r'c\.b (\d), (\d) ([a-z])\.([a-z])': r'cb\1\2\3\4', + 'c.b. 1 7 d.y': 'cb17dy', + 'c.b.1 7 d.y': 'cb17dy', + 'c.b 25, 9 a.q': 'cb259aq', + 'isc.b 25, 9 a.q': 'is cb259aq', + 'c.b2, 1 u.f': 'cb21uf', + 'c.b 1,2 q.a': 'cb12qa', + '0-122-336-5664': '01223365664', + 'postcodecb21rs': 'postcode cb21rs', + r'i\.d': 'id', + ' i d ': 'id', + 'Telephone:01223358966': 'Telephone: 01223358966', + 'depature': 'departure', + 'depearting': 'departing', + '-type': ' type', + r'b[\s]?&[\s]?b': 'bed and breakfast', + 'b and b': 'bed and breakfast', + r'guesthouse[s]?': 'guest house', + r'swimmingpool[s]?': 'swimming pool', + "wo n\'t": 'will not', + " \'d ": ' would ', + " \'m ": ' am ', + " \'re' ": ' are ', + " \'ll' ": ' will ', + " \'ve ": ' have ', + r'^\'': '', + r'\'$': '', + } + for tmpl, good in baddata.items(): + text = re.sub(tmpl, good, text) + + text = re.sub(r'([a-zT]+)\.([a-z])', r'\1 . \2', + text) # 'abc.xyz' -> 'abc . xyz' + text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text) # if 'abc. ' -> 'abc . 
' + + with open(os.path.join(data_dir, 'mapping.pair'), 'r') as fin: + for line in fin.readlines(): + fromx, tox = line.replace('\n', '').split('\t') + text = ' ' + text + ' ' + text = text.replace(' ' + fromx + ' ', ' ' + tox + ' ')[1:-1] + + return text + + +def clean_time(utter): + utter = re.sub(r'(\d+) ([ap]\.?m)', lambda x: x.group(1) + x.group(2), + utter) # 9 am -> 9am + utter = re.sub(r'((?>> def compare_fn(v1, v2, key, type): @@ -202,6 +206,18 @@ class RegressTool: trainer, '_seed') else trainer.seed if hasattr(trainer, 'seed') else None + if reset_dropout: + with torch.no_grad(): + + def reinit_dropout(_module): + for name, submodule in _module.named_children(): + if isinstance(submodule, torch.nn.Dropout): + setattr(_module, name, torch.nn.Dropout(0.)) + else: + reinit_dropout(submodule) + + reinit_dropout(module) + if level == 'strict': hack_forward(module, file_name, io_json) intercept_module(module, io_json) @@ -285,19 +301,23 @@ class MsRegressTool(RegressTool): file_name, level='config', compare_fn=None, - ignore_keys=None): + ignore_keys=None, + compare_random=True, + lazy_stop_callback=None): - def lazy_stop_callback(): + if lazy_stop_callback is None: - from modelscope.trainers.hooks.hook import Hook, Priority + def lazy_stop_callback(): - class EarlyStopHook(Hook): - PRIORITY = Priority.VERY_LOW + from modelscope.trainers.hooks.hook import Hook, Priority - def after_iter(self, trainer): - raise MsRegressTool.EarlyStopError('Test finished.') + class EarlyStopHook(Hook): + PRIORITY = Priority.VERY_LOW - trainer.register_hook(EarlyStopHook()) + def after_iter(self, trainer): + raise MsRegressTool.EarlyStopError('Test finished.') + + trainer.register_hook(EarlyStopHook()) def _train_loop(trainer, *args, **kwargs): with self.monitor_module_train( @@ -306,6 +326,7 @@ class MsRegressTool(RegressTool): level, compare_fn=compare_fn, ignore_keys=ignore_keys, + compare_random=compare_random, lazy_stop_callback=lazy_stop_callback): try: return trainer.train_loop_origin(*args, **kwargs) @@ -331,10 +352,10 @@ def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): return type(tensors)( numpify_tensor_nested(t, reduction, clip_value) for t in tensors) if isinstance(tensors, Mapping): - return type(tensors)({ + return { k: numpify_tensor_nested(t, reduction, clip_value) for k, t in tensors.items() - }) + } if isinstance(tensors, torch.Tensor): t: np.ndarray = tensors.cpu().numpy() if clip_value is not None: @@ -354,9 +375,7 @@ def detach_tensor_nested(tensors): if isinstance(tensors, (list, tuple)): return type(tensors)(detach_tensor_nested(t) for t in tensors) if isinstance(tensors, Mapping): - return type(tensors)( - {k: detach_tensor_nested(t) - for k, t in tensors.items()}) + return {k: detach_tensor_nested(t) for k, t in tensors.items()} if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors @@ -475,7 +494,11 @@ def intercept_module(module: nn.Module, intercept_module(module, io_json, full_name, restore) -def compare_arguments_nested(print_content, arg1, arg2): +def compare_arguments_nested(print_content, + arg1, + arg2, + rtol=1.e-3, + atol=1.e-8): type1 = type(arg1) type2 = type(arg2) if type1.__name__ != type2.__name__: @@ -494,7 +517,7 @@ def compare_arguments_nested(print_content, arg1, arg2): return False return True elif isinstance(arg1, (float, np.floating)): - if not np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8, equal_nan=True): + if not np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True): if print_content is not None: 
print(f'{print_content}, arg1:{arg1}, arg2:{arg2}') return False @@ -541,7 +564,7 @@ def compare_arguments_nested(print_content, arg1, arg2): arg2 = np.where(np.equal(arg2, None), np.NaN, arg2).astype(dtype=np.float) if not all( - np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8, + np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True).flatten()): if print_content is not None: print(f'{print_content}') diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 7889d944..b68a639c 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -1,15 +1,24 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. -from collections.abc import Mapping - -import numpy as np +from collections import Mapping def torch_nested_numpify(tensors): + """ Numpify nested torch tensors. + + NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. + + @param tensors: Nested torch tensors. + @return: The numpify tensors. + """ + import torch "Numpify `tensors` (even if it's a nested list/tuple of tensors)." if isinstance(tensors, (list, tuple)): return type(tensors)(torch_nested_numpify(t) for t in tensors) + if isinstance(tensors, Mapping): + # return dict + return {k: torch_nested_numpify(t) for k, t in tensors.items()} if isinstance(tensors, torch.Tensor): t = tensors.cpu() return t.numpy() @@ -17,10 +26,20 @@ def torch_nested_numpify(tensors): def torch_nested_detach(tensors): + """ Detach nested torch tensors. + + NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. + + @param tensors: Nested torch tensors. + @return: The detached tensors. + """ + import torch "Detach `tensors` (even if it's a nested list/tuple of tensors)." 
if isinstance(tensors, (list, tuple)): return type(tensors)(torch_nested_detach(t) for t in tensors) + if isinstance(tensors, Mapping): + return {k: torch_nested_detach(t) for k, t in tensors.items()} if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index 7adba982..5109db11 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -11,12 +11,13 @@ import sys import tarfile import tempfile import unittest +from collections import OrderedDict import requests -from datasets import Dataset +import torch from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE +from torch.utils.data import Dataset -from modelscope.msdatasets import MsDataset from .torch_utils import _find_free_port TEST_LEVEL = 2 @@ -48,9 +49,25 @@ def set_test_level(level: int): TEST_LEVEL = level +class DummyTorchDataset(Dataset): + + def __init__(self, feat, label, num) -> None: + self.feat = feat + self.label = label + self.num = num + + def __getitem__(self, index): + return { + 'feat': torch.Tensor(self.feat), + 'labels': torch.Tensor(self.label) + } + + def __len__(self): + return self.num + + def create_dummy_test_dataset(feat, label, num): - return MsDataset.from_hf_dataset( - Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num))) + return DummyTorchDataset(feat, label, num) def download_and_untar(fpath, furl, dst) -> str: @@ -71,6 +88,37 @@ def download_and_untar(fpath, furl, dst) -> str: return target_dir_path +def get_case_model_info(): + status_code, result = subprocess.getstatusoutput( + 'grep -rn "damo/" tests/ | grep -v ".pyc" | grep -v "Binary file" | grep -v run.py ' + ) + lines = result.split('\n') + test_cases = OrderedDict() + model_cases = OrderedDict() + for line in lines: + # "tests/msdatasets/test_ms_dataset.py:92: model_id = 'damo/bert-base-sst2'" + line = line.strip() + elements = line.split(':') + test_file = elements[0] + model_pos = line.find('damo') + left_quote = line[model_pos - 1] + rquote_idx = line.rfind(left_quote) + model_name = line[model_pos:rquote_idx] + if test_file not in test_cases: + test_cases[test_file] = set() + model_info = test_cases[test_file] + model_info.add(model_name) + + if model_name not in model_cases: + model_cases[model_name] = set() + case_info = model_cases[model_name] + case_info.add( + test_file.replace('tests/', '').replace('.py', + '').replace('/', '.')) + + return model_cases + + _DIST_SCRIPT_TEMPLATE = """ import ast import argparse diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py index 45e33c3e..6d4132f6 100644 --- a/modelscope/utils/torch_utils.py +++ b/modelscope/utils/torch_utils.py @@ -3,16 +3,16 @@ import functools import os import pickle +import random import socket import subprocess import tempfile from typing import Callable, List, Optional, Tuple +import numpy as np import torch import torch.multiprocessing as mp from torch import distributed as dist -from torch._utils import (_flatten_dense_tensors, _take_tensors, - _unflatten_dense_tensors) def _find_free_port() -> str: @@ -49,7 +49,6 @@ def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None: def _init_dist_pytorch(backend: str, **kwargs) -> None: # rank = int(os.environ['RANK']) local_rank = int(os.environ['LOCAL_RANK']) - torch.cuda.set_device(local_rank) dist.init_process_group(backend=backend, **kwargs) @@ -180,3 +179,20 @@ def broadcast(inputs, src): dist.broadcast(inputs_tensor, src) return 
pickle.loads(inputs_tensor.cpu().numpy().tobytes()) + + +def set_random_seed(seed): + if seed is not None and seed >= 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + else: + raise ValueError( + f'Random seed should be positive, current seed is {seed}') + + +def set_random_seed_mpu(seed): + from megatron import mpu + set_random_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) diff --git a/modelscope/utils/type_assert.py b/modelscope/utils/type_assert.py index aaeadcb9..f732a81a 100644 --- a/modelscope/utils/type_assert.py +++ b/modelscope/utils/type_assert.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from functools import wraps from inspect import signature diff --git a/modelscope/version.py b/modelscope/version.py index d93912ee..908c0bb7 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.3.7' +__version__ = '0.4.3' diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index ef5d4341..02e87baa 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -1,4 +1,3 @@ -fairseq ftfy>=6.0.3 ofa>=0.0.2 pycocoevalcap>=1.2 diff --git a/requirements/nlp.txt b/requirements/nlp.txt index ada4fc50..15f2f41a 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,6 +1,6 @@ en_core_web_sm>=2.3.5 -fairseq>=0.10.2 jieba>=0.42.1 +megatron_util pai-easynlp # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated diff --git a/setup.cfg b/setup.cfg index c98dbe05..3dc64f86 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,7 @@ quiet-level = 3 ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids [flake8] -select = B,C,E,F,P,T4,W,B9 max-line-length = 120 -ignore = F401,F405,F821,W503 +select = B,C,E,F,P,T4,W,B9 +ignore = F401,F405,F821,W503,E251 exclude = docs/src,*.pyi,.git diff --git a/tests/export/__init__.py b/tests/export/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py new file mode 100644 index 00000000..535b3f5d --- /dev/null +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -0,0 +1,37 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
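A small usage sketch of the new seeding helper in torch_utils above; the seed value is arbitrary, and set_random_seed_mpu additionally requires megatron to be importable.

# Seed python, numpy and torch (CPU + all CUDA devices) in one call.
from modelscope.utils.torch_utils import set_random_seed

set_random_seed(42)

# Passing None or a negative value raises ValueError, since the helper
# only accepts a non-negative seed.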
+import os +import shutil +import tempfile +import unittest + +from modelscope.exporters import Exporter, TorchModelExporter +from modelscope.models.base import Model +from modelscope.utils.test_utils import test_level + + +class TestExportSbertSequenceClassification(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_export_sbert_sequence_classification(self): + model = Model.from_pretrained(self.model_id) + print( + Exporter.from_model(model).export_onnx( + shape=(2, 256), outputs=self.tmp_dir)) + print( + TorchModelExporter.from_model(model).export_torch_script( + shape=(2, 256), outputs=self.tmp_dir)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/hub/test_hub_examples.py b/tests/hub/test_hub_examples.py index 3fb6823f..d1f7594e 100644 --- a/tests/hub/test_hub_examples.py +++ b/tests/hub/test_hub_examples.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import unittest from modelscope.hub.api import HubApi diff --git a/tests/hub/test_hub_private_repository.py b/tests/hub/test_hub_private_repository.py index 8683a884..dab2b891 100644 --- a/tests/hub/test_hub_private_repository.py +++ b/tests/hub/test_hub_private_repository.py @@ -10,7 +10,8 @@ from modelscope.hub.errors import GitError from modelscope.hub.repository import Repository from modelscope.utils.constant import ModelFile from .test_utils import (TEST_ACCESS_TOKEN1, TEST_ACCESS_TOKEN2, - TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG) + TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG, + delete_credential) DEFAULT_GIT_PATH = 'git' @@ -65,6 +66,18 @@ class HubPrivateRepositoryTest(unittest.TestCase): print(repo2.model_dir) assert repo1.model_dir == repo2.model_dir + def test_clone_private_model_without_token(self): + delete_credential() + temporary_dir = tempfile.mkdtemp() + local_dir = os.path.join(temporary_dir, self.model_name) + with self.assertRaises(GitError) as cm: + Repository(local_dir, clone_from=self.model_id) + + print(cm.exception) + assert not os.path.exists(os.path.join(local_dir, ModelFile.README)) + + self.api.login(TEST_ACCESS_TOKEN1) # re-login for delete + if __name__ == '__main__': unittest.main() diff --git a/tests/hub/test_utils.py b/tests/hub/test_utils.py index 38a74fd4..3d312dc0 100644 --- a/tests/hub/test_utils.py +++ b/tests/hub/test_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import shutil from codecs import ignore_errors diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 61b1c6a4..1179414d 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -87,7 +87,6 @@ class DatasetUploadTest(unittest.TestCase): MsDataset.upload_meta( dataset_work_dir=self.test_meta_dir, - dataset_id=os.path.join(self.namespace, self.dataset_name), commit_message='Update for unit test.') diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 9780ac4b..762530f4 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. 
and its affiliates. + import unittest from modelscope.models import Model diff --git a/tests/pipelines/test_action_detection.py b/tests/pipelines/test_action_detection.py index c752dc78..ae7e60b1 100644 --- a/tests/pipelines/test_action_detection.py +++ b/tests/pipelines/test_action_detection.py @@ -2,21 +2,28 @@ import unittest from modelscope.pipelines import pipeline -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ActionDetectionTest(unittest.TestCase): +class ActionDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.action_detection + self.model_id = 'damo/cv_ResNetC3D_action-detection_detection2d' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): - action_detection_pipline = pipeline( - Tasks.action_detection, - model='damo/cv_ResNetC3D_action-detection_detection2d') + action_detection_pipline = pipeline(self.task, model=self.model_id) result = action_detection_pipline( 'data/test/videos/action_detection_test_video.mp4') print('action detection results:', result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py index e955eb60..b9548630 100644 --- a/tests/pipelines/test_action_recognition.py +++ b/tests/pipelines/test_action_recognition.py @@ -1,24 +1,21 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# !/usr/bin/env python -import os.path as osp -import tempfile import unittest -from modelscope.fileio import File from modelscope.pipelines import pipeline -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ActionRecognitionTest(unittest.TestCase): +class ActionRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.action_recognition self.model_id = 'damo/cv_TAdaConv_action-recognition' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - recognition_pipeline = pipeline( - Tasks.action_recognition, model=self.model_id) + recognition_pipeline = pipeline(self.task, self.model_id) result = recognition_pipeline( 'data/test/videos/action_recognition_test_video.mp4') @@ -26,12 +23,16 @@ class ActionRecognitionTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): - recognition_pipeline = pipeline(Tasks.action_recognition) + recognition_pipeline = pipeline(self.task) result = recognition_pipeline( 'data/test/videos/action_recognition_test_video.mp4') print(f'recognition output: {result}.') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_animal_recognition.py b/tests/pipelines/test_animal_recognition.py index 3a31afed..eb9f92e6 100644 --- a/tests/pipelines/test_animal_recognition.py +++ b/tests/pipelines/test_animal_recognition.py @@ -1,20 +1,30 @@ +# 
Copyright (c) Alibaba, Inc. and its affiliates. + import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class AnimalRecognitionTest(unittest.TestCase): +class AnimalRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.animal_recognition + self.model_id = 'damo/cv_resnest101_animal_recognition' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): animal_recognition = pipeline( - Tasks.animal_recognition, - model='damo/cv_resnest101_animal_recognition') + Tasks.animal_recognition, model=self.model_id) result = animal_recognition('data/test/images/dogs.jpg') print(result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py index a83f5031..303fb6b9 100644 --- a/tests/pipelines/test_automatic_speech_recognition.py +++ b/tests/pipelines/test_automatic_speech_recognition.py @@ -10,24 +10,24 @@ import soundfile from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import ColorCodes, Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import download_and_untar, test_level logger = get_logger() WAV_FILE = 'data/test/audios/asr_example.wav' +URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav' LITTLE_TESTSETS_FILE = 'data_aishell.tar.gz' LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/data_aishell.tar.gz' -AISHELL1_TESTSETS_FILE = 'aishell1.tar.gz' -AISHELL1_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/aishell1.tar.gz' - TFRECORD_TESTSETS_FILE = 'tfrecord.tar.gz' TFRECORD_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/tfrecord.tar.gz' -class AutomaticSpeechRecognitionTest(unittest.TestCase): +class AutomaticSpeechRecognitionTest(unittest.TestCase, + DemoCompatibilityCheck): action_info = { 'test_run_with_wav_pytorch': { 'checking_item': OutputKeys.TEXT, @@ -45,6 +45,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): 'checking_item': OutputKeys.TEXT, 'example': 'wav_example' }, + 'test_run_with_url_tf': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, 'test_run_with_wav_dataset_pytorch': { 'checking_item': OutputKeys.TEXT, 'example': 'dataset_example' @@ -75,6 +79,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): self.am_tf_model_id = 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1' # this temporary workspace dir will store waveform files self.workspace = os.path.join(os.getcwd(), '.tmp') + self.task = Tasks.auto_speech_recognition if not os.path.exists(self.workspace): os.mkdir(self.workspace) @@ -132,8 +137,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav_pytorch(self): - '''run with single waveform file - ''' + """run with single waveform file + """ logger.info('Run ASR test with waveform file 
(pytorch)...') @@ -145,8 +150,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_pcm_pytorch(self): - '''run with wav data - ''' + """run with wav data + """ logger.info('Run ASR test with wav data (pytorch)...') @@ -158,8 +163,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav_tf(self): - '''run with single waveform file - ''' + """run with single waveform file + """ logger.info('Run ASR test with waveform file (tensorflow)...') @@ -171,8 +176,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_pcm_tf(self): - '''run with wav data - ''' + """run with wav data + """ logger.info('Run ASR test with wav data (tensorflow)...') @@ -182,9 +187,20 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): model_id=self.am_tf_model_id, audio_in=audio, sr=sr) self.check_result('test_run_with_pcm_tf', rec_result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_url_tf(self): + """run with single url file + """ + + logger.info('Run ASR test with url file (tensorflow)...') + + rec_result = self.run_pipeline( + model_id=self.am_tf_model_id, audio_in=URL_FILE) + self.check_result('test_run_with_url_tf', rec_result) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_wav_dataset_pytorch(self): - '''run with datasets, and audio format is waveform + """run with datasets, and audio format is waveform datasets directory: wav @@ -199,7 +215,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): ... transcript data.text # hypothesis text - ''' + """ logger.info('Run ASR test with waveform dataset (pytorch)...') logger.info('Downloading waveform testsets file ...') @@ -215,7 +231,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_wav_dataset_tf(self): - '''run with datasets, and audio format is waveform + """run with datasets, and audio format is waveform datasets directory: wav @@ -230,7 +246,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): ... 
transcript data.text # hypothesis text - ''' + """ logger.info('Run ASR test with waveform dataset (tensorflow)...') logger.info('Downloading waveform testsets file ...') @@ -244,6 +260,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): model_id=self.am_tf_model_id, audio_in=dataset_path) self.check_result('test_run_with_wav_dataset_tf', rec_result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_body_2d_keypoints.py b/tests/pipelines/test_body_2d_keypoints.py index d010adc5..5d90cbf0 100644 --- a/tests/pipelines/test_body_2d_keypoints.py +++ b/tests/pipelines/test_body_2d_keypoints.py @@ -2,20 +2,20 @@ import unittest import cv2 -import numpy as np from PIL import Image -from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import draw_keypoints +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class Body2DKeypointsTest(unittest.TestCase): +class Body2DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.body_2d_keypoints self.model_id = 'damo/cv_hrnetv2w32_body-2d-keypoints_image' self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg' @@ -26,16 +26,18 @@ class Body2DKeypointsTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_modelhub_with_image_file(self): - body_2d_keypoints = pipeline( - Tasks.body_2d_keypoints, model=self.model_id) + body_2d_keypoints = pipeline(self.task, model=self.model_id) self.pipeline_inference(body_2d_keypoints, self.test_image) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub_with_image_input(self): - body_2d_keypoints = pipeline( - Tasks.body_2d_keypoints, model=self.model_id) + body_2d_keypoints = pipeline(self.task, model=self.model_id) self.pipeline_inference(body_2d_keypoints, Image.open(self.test_image)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py index 50426414..9dce0d19 100644 --- a/tests/pipelines/test_body_3d_keypoints.py +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -1,23 +1,23 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import pdb import unittest import cv2 import numpy as np -from PIL import Image from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class Body3DKeypointsTest(unittest.TestCase): +class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_canonical_body-3d-keypoints_video' self.test_video = 'data/test/videos/Walking.54138969.mp4' + self.task = Tasks.body_3d_keypoints def pipeline_inference(self, pipeline: Pipeline, pipeline_input): output = pipeline(pipeline_input) @@ -44,6 +44,10 @@ class Body3DKeypointsTest(unittest.TestCase): body_3d_keypoints = pipeline(Tasks.body_3d_keypoints) self.pipeline_inference(body_3d_keypoints, self.test_video) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py index 694ebf40..68eae385 100644 --- a/tests/pipelines/test_cmdssl_video_embedding.py +++ b/tests/pipelines/test_cmdssl_video_embedding.py @@ -4,20 +4,28 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class CMDSSLVideoEmbeddingTest(unittest.TestCase): +class CMDSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_embedding + self.model_id = 'damo/cv_r2p1d_video_embedding' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - videossl_pipeline = pipeline( - Tasks.video_embedding, model='damo/cv_r2p1d_video_embedding') + videossl_pipeline = pipeline(task=self.task, model=self.model_id) result = videossl_pipeline( 'data/test/videos/action_recognition_test_video.mp4') print(f'video embedding output: {result}.') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py index 0504cb7c..80c72337 100644 --- a/tests/pipelines/test_conversational_text_to_sql.py +++ b/tests/pipelines/test_conversational_text_to_sql.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import unittest -from typing import List from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model @@ -9,11 +8,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline from modelscope.preprocessors import ConversationalTextToSqlPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results from modelscope.utils.test_utils import test_level -class ConversationalTextToSql(unittest.TestCase): +class ConversationalTextToSql(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.conversational_text_to_sql + self.model_id = 'damo/nlp_star_conversational-text-to-sql' + model_id = 'damo/nlp_star_conversational-text-to-sql' test_case = { 'database_id': @@ -39,10 +44,7 @@ class ConversationalTextToSql(unittest.TestCase): pipelines = [ ConversationalTextToSqlPipeline( model=model, preprocessor=preprocessor), - pipeline( - task=Tasks.conversational_text_to_sql, - model=model, - preprocessor=preprocessor) + pipeline(task=self.task, model=model, preprocessor=preprocessor) ] text2sql_tracking_and_print_results(self.test_case, pipelines) @@ -55,26 +57,24 @@ class ConversationalTextToSql(unittest.TestCase): pipelines = [ ConversationalTextToSqlPipeline( model=model, preprocessor=preprocessor), - pipeline( - task=Tasks.conversational_text_to_sql, - model=model, - preprocessor=preprocessor) + pipeline(task=self.task, model=model, preprocessor=preprocessor) ] text2sql_tracking_and_print_results(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=Tasks.conversational_text_to_sql, model=self.model_id) - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] text2sql_tracking_and_print_results(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipelines = [pipeline(task=Tasks.conversational_text_to_sql)] + pipelines = [pipeline(task=self.task)] text2sql_tracking_and_print_results(self.test_case, pipelines) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_crowd_counting.py b/tests/pipelines/test_crowd_counting.py index 99f5ffd2..4e15cfca 100644 --- a/tests/pipelines/test_crowd_counting.py +++ b/tests/pipelines/test_crowd_counting.py @@ -8,17 +8,19 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import numpy_to_cv2img +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level logger = get_logger() -class CrowdCountingTest(unittest.TestCase): +class CrowdCountingTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.input_location = 'data/test/images/crowd_counting.jpg' self.model_id = 'damo/cv_hrnet_crowd-counting_dcanet' + self.task = Tasks.crowd_counting def save_result(self, result): print('scores:', result[OutputKeys.SCORES]) @@ -28,7 +30,7 @@ class CrowdCountingTest(unittest.TestCase): 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_crowd_counting(self): - crowd_counting = pipeline(Tasks.crowd_counting, model=self.model_id) + crowd_counting = pipeline(task=self.task, model=self.model_id) result = crowd_counting(self.input_location) if result: self.save_result(result) @@ -37,7 +39,7 @@ class CrowdCountingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_crowd_counting_with_image(self): - crowd_counting = pipeline(Tasks.crowd_counting, model=self.model_id) + crowd_counting = pipeline(task=self.task, model=self.model_id) img = Image.open(self.input_location) result = crowd_counting(img) if result: @@ -47,13 +49,17 @@ class CrowdCountingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_crowd_counting_with_default_task(self): - crowd_counting = pipeline(Tasks.crowd_counting) + crowd_counting = pipeline(self.task) result = crowd_counting(self.input_location) if result: self.save_result(result) else: raise ValueError('process error') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py index bb6022ec..f7ec81cd 100644 --- a/tests/pipelines/test_csanmt_translation.py +++ b/tests/pipelines/test_csanmt_translation.py @@ -3,31 +3,39 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TranslationTest(unittest.TestCase): +class TranslationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.translation + self.model_id = 'damo/nlp_csanmt_translation_zh2en' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name_for_zh2en(self): - model_id = 'damo/nlp_csanmt_translation_zh2en' inputs = '声明补充说,沃伦的同事都深感震惊,并且希望他能够投案自首。' - pipeline_ins = pipeline(task=Tasks.translation, model=model_id) + pipeline_ins = pipeline(self.task, model=self.model_id) print(pipeline_ins(input=inputs)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name_for_en2zh(self): model_id = 'damo/nlp_csanmt_translation_en2zh' inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.' 
- pipeline_ins = pipeline(task=Tasks.translation, model=model_id) + pipeline_ins = pipeline(self.task, model=model_id) print(pipeline_ins(input=inputs)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): inputs = '声明补充说,沃伦的同事都深感震惊,并且希望他能够投案自首。' - pipeline_ins = pipeline(task=Tasks.translation) + pipeline_ins = pipeline(self.task) print(pipeline_ins(input=inputs)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py index afd68442..5894297f 100644 --- a/tests/pipelines/test_dialog_intent_prediction.py +++ b/tests/pipelines/test_dialog_intent_prediction.py @@ -8,11 +8,16 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogIntentPredictionPipeline from modelscope.preprocessors import DialogIntentPredictionPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class DialogIntentPredictionTest(unittest.TestCase): - model_id = 'damo/nlp_space_dialog-intent-prediction' +class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.task_oriented_conversation + self.model_id = 'damo/nlp_space_dialog-intent-prediction' + test_case = [ 'How do I locate my card?', 'I still have not received my new card, I ordered over a week ago.' @@ -61,13 +66,15 @@ class DialogIntentPredictionTest(unittest.TestCase): def test_run_with_model_name(self): pipelines = [ pipeline( - task=Tasks.task_oriented_conversation, - model=self.model_id, - model_revision='update') + task=self.task, model=self.model_id, model_revision='update') ] for my_pipeline, item in list(zip(pipelines, self.test_case)): print(my_pipeline(item)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py index 299af2e9..19d6ed2f 100644 --- a/tests/pipelines/test_dialog_modeling.py +++ b/tests/pipelines/test_dialog_modeling.py @@ -10,11 +10,16 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogModelingPipeline from modelscope.preprocessors import DialogModelingPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class DialogModelingTest(unittest.TestCase): - model_id = 'damo/nlp_space_dialog-modeling' +class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.task_oriented_conversation + self.model_id = 'damo/nlp_space_dialog-modeling' + test_case = { 'sng0073': { 'goal': { @@ -139,7 +144,7 @@ class DialogModelingTest(unittest.TestCase): def test_run_with_model_name(self): pipelines = [ pipeline( - task=Tasks.task_oriented_conversation, + task=self.task, model=self.model_id, model_revision='task_oriented_conversation') ] @@ -149,11 +154,14 @@ class DialogModelingTest(unittest.TestCase): def test_run_with_default_model(self): pipelines = [ pipeline( - task=Tasks.task_oriented_conversation, - 
model_revision='task_oriented_conversation') + task=self.task, model_revision='task_oriented_conversation') ] self.generate_and_print_dialog_response(pipelines) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py index 843aade9..81bdd9be 100644 --- a/tests/pipelines/test_dialog_state_tracking.py +++ b/tests/pipelines/test_dialog_state_tracking.py @@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogStateTrackingPipeline from modelscope.preprocessors import DialogStateTrackingPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states from modelscope.utils.test_utils import test_level -class DialogStateTrackingTest(unittest.TestCase): - model_id = 'damo/nlp_space_dialog-state-tracking' +class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.task_oriented_conversation + self.model_id = 'damo/nlp_space_dialog-state-tracking' + test_case = [{ 'User-1': 'Hi, I\'m looking for a train that is going to cambridge and arriving there by 20:45, ' @@ -103,10 +108,7 @@ class DialogStateTrackingTest(unittest.TestCase): pipelines = [ DialogStateTrackingPipeline( model=model, preprocessor=preprocessor), - pipeline( - task=Tasks.task_oriented_conversation, - model=model, - preprocessor=preprocessor) + pipeline(task=self.task, model=model, preprocessor=preprocessor) ] tracking_and_print_dialog_states(self.test_case, pipelines) @@ -115,12 +117,14 @@ class DialogStateTrackingTest(unittest.TestCase): def test_run_with_model_name(self): pipelines = [ pipeline( - task=Tasks.task_oriented_conversation, - model=self.model_id, - model_revision='update') + task=self.task, model=self.model_id, model_revision='update') ] tracking_and_print_dialog_states(self.test_case, pipelines) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_document_segmentation.py b/tests/pipelines/test_document_segmentation.py index 39609be8..b4406fef 100644 --- a/tests/pipelines/test_document_segmentation.py +++ b/tests/pipelines/test_document_segmentation.py @@ -6,13 +6,18 @@ from typing import Any, Dict from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level logger = get_logger() -class DocumentSegmentationTest(unittest.TestCase): +class DocumentSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.document_segmentation + self.model_id = 'damo/nlp_bert_document-segmentation_chinese-base' model_id = 'damo/nlp_bert_document-segmentation_chinese-base' eng_model_id = 'damo/nlp_bert_document-segmentation_english-base' @@ -21,10 +26,8 @@ class DocumentSegmentationTest(unittest.TestCase): eng_sentences = 'The Saint Alexander Nevsky Church was established in 1936 by Archbishop Vitaly (Maximenko) () on a tract 
of land donated by Yulia Martinovna Plavskaya.The initial chapel, dedicated to the memory of the great prince St. Alexander Nevsky (1220–1263), was blessed in May, 1936.The church building was subsequently expanded three times.In 1987, ground was cleared for the construction of the new church and on September 12, 1989, on the Feast Day of St. Alexander Nevsky, the cornerstone was laid and the relics of St. Herman of Alaska placed in the foundation.The imposing edifice, completed in 1997, is the work of Nikolaus Karsanov, architect and Protopresbyter Valery Lukianov, engineer.Funds were raised through donations.The Great blessing of the cathedral took place on October 18, 1997 with seven bishops, headed by Metropolitan Vitaly Ustinov, and 36 priests and deacons officiating, some 800 faithful attended the festivity.The old church was rededicated to Our Lady of Tikhvin.Metropolitan Hilarion (Kapral) announced, that cathedral will officially become the episcopal See of the Ruling Bishop of the Eastern American Diocese and the administrative center of the Diocese on September 12, 2014.At present the parish serves the spiritual needs of 300 members.The parochial school instructs over 90 boys and girls in religion, Russian language and history.The school meets every Saturday.The choir is directed by Andrew Burbelo.The sisterhood attends to the needs of the church and a church council acts in the administration of the community.The cathedral is decorated by frescoes in the Byzantine style.The iconography project was fulfilled by Father Andrew Erastov and his students from 1995 until 2001.' # noqa * def run_pipeline(self, model_id: str, documents: str) -> Dict[str, Any]: - p = pipeline(task=Tasks.document_segmentation, model=model_id) - + p = pipeline(task=self.task, model=model_id) result = p(documents=documents) - return result @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -51,6 +54,10 @@ class DocumentSegmentationTest(unittest.TestCase): for document in documents_list: print(document) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py index 03dd75a6..f89e9a94 100644 --- a/tests/pipelines/test_face_detection.py +++ b/tests/pipelines/test_face_detection.py @@ -3,19 +3,19 @@ import os.path as osp import unittest import cv2 -import numpy as np from modelscope.msdatasets import MsDataset -from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import draw_face_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class FaceDetectionTest(unittest.TestCase): +class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.face_detection self.model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' def show_result(self, img_path, detection_result): @@ -49,6 +49,10 @@ class FaceDetectionTest(unittest.TestCase): result = face_detection(img_path) self.show_result(img_path, result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git 
a/tests/pipelines/test_face_image_generation.py b/tests/pipelines/test_face_image_generation.py index c758ea3a..21d8e835 100644 --- a/tests/pipelines/test_face_image_generation.py +++ b/tests/pipelines/test_face_image_generation.py @@ -8,12 +8,14 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class FaceGenerationTest(unittest.TestCase): +class FaceGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.face_image_generation self.model_id = 'damo/cv_gan_face-image-generation' def pipeline_inference(self, pipeline: Pipeline, seed: int): @@ -26,7 +28,7 @@ class FaceGenerationTest(unittest.TestCase): def test_run_modelhub(self): seed = 10 face_generation = pipeline( - Tasks.face_image_generation, + self.task, model=self.model_id, ) self.pipeline_inference(face_generation, seed) @@ -34,9 +36,13 @@ class FaceGenerationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): seed = 10 - face_generation = pipeline(Tasks.face_image_generation) + face_generation = pipeline(self.task) self.pipeline_inference(face_generation, seed) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_face_recognition.py b/tests/pipelines/test_face_recognition.py index 015205d6..d3451f5d 100644 --- a/tests/pipelines/test_face_recognition.py +++ b/tests/pipelines/test_face_recognition.py @@ -6,12 +6,14 @@ import numpy as np from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class FaceRecognitionTest(unittest.TestCase): +class FaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.face_recognition self.model_id = 'damo/cv_ir101_facerecognition_cfglint' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -26,6 +28,10 @@ class FaceRecognitionTest(unittest.TestCase): sim = np.dot(emb1[0], emb2[0]) print(f'Cos similarity={sim:.3f}, img1:{img1} img2:{img2}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py index 3a87643c..7eea0ddf 100644 --- a/tests/pipelines/test_faq_question_answering.py +++ b/tests/pipelines/test_faq_question_answering.py @@ -11,11 +11,16 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class FaqQuestionAnsweringTest(unittest.TestCase): - model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base' +class FaqQuestionAnsweringTest(unittest.TestCase, 
DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.faq_question_answering + self.model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base' + param = { 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], 'support_set': [{ @@ -80,6 +85,10 @@ class FaqQuestionAnsweringTest(unittest.TestCase): ['今天星期六', '明天星期几明天星期几']) print(np.shape(sentence_vec)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 6b37f6df..cec8966f 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -9,11 +9,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline from modelscope.preprocessors import FillMaskPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level -class FillMaskTest(unittest.TestCase): +class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.fill_mask + self.model_id = 'damo/nlp_veco_fill-mask-large' + model_id_sbert = { 'zh': 'damo/nlp_structbert_fill-mask_chinese-large', 'en': 'damo/nlp_structbert_fill-mask_english-large' @@ -134,6 +140,10 @@ class FillMaskTest(unittest.TestCase): print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_fill_mask_ponet.py b/tests/pipelines/test_fill_mask_ponet.py new file mode 100644 index 00000000..707cc201 --- /dev/null +++ b/tests/pipelines/test_fill_mask_ponet.py @@ -0,0 +1,48 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.metainfo import Pipelines +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class FillMaskPonetTest(unittest.TestCase): + model_id_ponet = { + 'zh': 'damo/nlp_ponet_fill-mask_chinese-base', + 'en': 'damo/nlp_ponet_fill-mask_english-base' + } + + ori_texts = { + 'zh': + '段誉轻挥折扇,摇了摇头,说道:“你师父是你的师父,你师父可不是我的师父。' + '你师父差得动你,你师父可差不动我。', + 'en': + 'Everything in what you call reality is really just a reflection of your ' + 'consciousness. Your whole universe is just a mirror reflection of your story.' + } + + test_inputs = { + 'zh': + '段誉轻[MASK]折扇,摇了摇[MASK],[MASK]道:“你师父是你的[MASK][MASK],你' + '师父可不是[MASK]的师父。你师父差得动你,你师父可[MASK]不动我。', + 'en': + 'Everything in [MASK] you call reality is really [MASK] a reflection of your ' + '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.' 
+ } + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_ponet_model(self): + for language in ['zh', 'en']: + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=self.model_id_ponet[language]) + + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_general_image_classification.py b/tests/pipelines/test_general_image_classification.py index 8a814f4a..d5357f02 100644 --- a/tests/pipelines/test_general_image_classification.py +++ b/tests/pipelines/test_general_image_classification.py @@ -1,11 +1,19 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class GeneralImageClassificationTest(unittest.TestCase): +class GeneralImageClassificationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_classification + self.model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_ImageNet(self): @@ -29,6 +37,10 @@ class GeneralImageClassificationTest(unittest.TestCase): result = general_image_classification('data/test/images/bird.JPEG') print(result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_general_recognition.py b/tests/pipelines/test_general_recognition.py index 0b32e1f5..ba713bbe 100644 --- a/tests/pipelines/test_general_recognition.py +++ b/tests/pipelines/test_general_recognition.py @@ -1,11 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class GeneralRecognitionTest(unittest.TestCase): +class GeneralRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.general_recognition + self.model_id = 'damo/cv_resnest101_general_recognition' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): @@ -15,6 +22,10 @@ class GeneralRecognitionTest(unittest.TestCase): result = general_recognition('data/test/images/dogs.jpg') print(result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py index d8593abb..9232ebd4 100644 --- a/tests/pipelines/test_generative_multi_modal_embedding.py +++ b/tests/pipelines/test_generative_multi_modal_embedding.py @@ -5,11 +5,16 @@ import unittest from modelscope.models import Model from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class GEMMMultiModalEmbeddingTest(unittest.TestCase): - model_id = 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' +class GEMMMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.generative_multi_modal_embedding + self.model_id = 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' + test_input = { 'image': 'data/test/images/generative_multimodal.jpg', 'text': @@ -63,6 +68,10 @@ class GEMMMultiModalEmbeddingTest(unittest.TestCase): output = generative_multi_modal_embedding_pipeline(test_input) print(output) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_hicossl_video_embedding.py b/tests/pipelines/test_hicossl_video_embedding.py index 5615cef2..8a7de1fa 100644 --- a/tests/pipelines/test_hicossl_video_embedding.py +++ b/tests/pipelines/test_hicossl_video_embedding.py @@ -4,12 +4,14 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class HICOSSLVideoEmbeddingTest(unittest.TestCase): +class HICOSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.video_embedding self.model_id = 'damo/cv_s3dg_video-embedding' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -21,6 +23,10 @@ class HICOSSLVideoEmbeddingTest(unittest.TestCase): print(f'video embedding output: {result}.') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_color_enhance.py b/tests/pipelines/test_image_color_enhance.py index c8ea5f9c..9b72999e 100644 --- 
a/tests/pipelines/test_image_color_enhance.py +++ b/tests/pipelines/test_image_color_enhance.py @@ -8,13 +8,15 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageColorEnhanceTest(unittest.TestCase): +class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_csrnet_image-color-enhance-models' + self.task = Tasks.image_color_enhancement def pipeline_inference(self, pipeline: Pipeline, input_location: str): result = pipeline(input_location) @@ -36,6 +38,10 @@ class ImageColorEnhanceTest(unittest.TestCase): self.pipeline_inference(img_color_enhance, 'data/test/images/image_color_enhance.png') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_colorization.py b/tests/pipelines/test_image_colorization.py index 1a02cffb..547fce89 100644 --- a/tests/pipelines/test_image_colorization.py +++ b/tests/pipelines/test_image_colorization.py @@ -8,14 +8,16 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageColorizationTest(unittest.TestCase): +class ImageColorizationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_unet_image-colorization' self.test_image = 'data/test/images/marilyn_monroe_4.jpg' + self.task = Tasks.image_colorization def pipeline_inference(self, pipeline: Pipeline, test_image: str): result = pipeline(test_image) @@ -35,6 +37,10 @@ class ImageColorizationTest(unittest.TestCase): image_colorization = pipeline(Tasks.image_colorization) self.pipeline_inference(image_colorization, self.test_image) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py index d3e0af24..bf8cfd0f 100644 --- a/tests/pipelines/test_image_denoise.py +++ b/tests/pipelines/test_image_denoise.py @@ -10,11 +10,16 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.cv import ImageDenoisePipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageDenoiseTest(unittest.TestCase): - model_id = 'damo/cv_nafnet_image-denoise_sidd' +class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_denoising + self.model_id = 'damo/cv_nafnet_image-denoise_sidd' + demo_image_path = 'data/test/images/noisy-demo-1.png' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -56,6 +61,10 @@ class ImageDenoiseTest(unittest.TestCase): w, h = denoise_img.size print('pipeline: the shape of output_img is {}x{}'.format(h, w)) + @unittest.skip('demo compatibility test is only 
enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_instance_segmentation.py b/tests/pipelines/test_image_instance_segmentation.py index cd08d669..2ba0724a 100644 --- a/tests/pipelines/test_image_instance_segmentation.py +++ b/tests/pipelines/test_image_instance_segmentation.py @@ -12,11 +12,16 @@ from modelscope.pipelines.cv import ImageInstanceSegmentationPipeline from modelscope.preprocessors import build_preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile, Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageInstanceSegmentationTest(unittest.TestCase): - model_id = 'damo/cv_swin-b_image-instance-segmentation_coco' +class ImageInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_swin-b_image-instance-segmentation_coco' + image = 'data/test/images/image_instance_segmentation.jpg' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -56,6 +61,10 @@ class ImageInstanceSegmentationTest(unittest.TestCase): print(f'pipeline1:{pipeline1(input=self.image)[OutputKeys.LABELS]}') print(f'pipeline2: {pipeline2(input=self.image)[OutputKeys.LABELS]}') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py index 83b7fee2..a3edb705 100644 --- a/tests/pipelines/test_image_matting.py +++ b/tests/pipelines/test_image_matting.py @@ -1,19 +1,18 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp -import tempfile import unittest import cv2 -from modelscope.fileio import File from modelscope.msdatasets import MsDataset from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageMattingTest(unittest.TestCase): +class ImageMattingTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_unet_image-matting' @@ -62,6 +61,10 @@ class ImageMattingTest(unittest.TestCase): f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}' ) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py index 3f07adf5..4f12e6af 100644 --- a/tests/pipelines/test_image_panoptic_segmentation.py +++ b/tests/pipelines/test_image_panoptic_segmentation.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import unittest import cv2 @@ -7,16 +9,20 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import panoptic_seg_masks_to_image +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImagePanopticSegmentationTest(unittest.TestCase): +class ImagePanopticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_image_panoptic_segmentation(self): input_location = 'data/test/images/image_panoptic_segmentation.jpg' - model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan' - pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id) + pan_segmentor = pipeline(Tasks.image_segmentation, model=self.model_id) result = pan_segmentor(input_location) draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS]) @@ -26,8 +32,7 @@ class ImagePanopticSegmentationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_image_panoptic_segmentation_from_PIL(self): input_location = 'data/test/images/image_panoptic_segmentation.jpg' - model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan' - pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id) + pan_segmentor = pipeline(Tasks.image_segmentation, model=self.model_id) PIL_array = PIL.Image.open(input_location) result = pan_segmentor(PIL_array) @@ -35,6 +40,10 @@ class ImagePanopticSegmentationTest(unittest.TestCase): cv2.imwrite('result.jpg', draw_img) print('print test_image_panoptic_segmentation from PIL return success') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_portrait_enhancement.py b/tests/pipelines/test_image_portrait_enhancement.py index 834fcfdb..1ca97253 100644 --- a/tests/pipelines/test_image_portrait_enhancement.py +++ b/tests/pipelines/test_image_portrait_enhancement.py @@ -9,12 +9,14 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImagePortraitEnhancementTest(unittest.TestCase): +class ImagePortraitEnhancementTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.image_portrait_enhancement self.model_id = 'damo/cv_gpen_image-portrait-enhancement' self.test_image = 'data/test/images/Solvay_conference_1927.png' @@ -37,6 +39,10 @@ class ImagePortraitEnhancementTest(unittest.TestCase): face_enhancement = pipeline(Tasks.image_portrait_enhancement) self.pipeline_inference(face_enhancement, self.test_image) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_reid_person.py b/tests/pipelines/test_image_reid_person.py index c3e8d487..310cdd66 100644 --- a/tests/pipelines/test_image_reid_person.py +++ 
b/tests/pipelines/test_image_reid_person.py @@ -6,14 +6,16 @@ from PIL import Image from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageReidPersonTest(unittest.TestCase): +class ImageReidPersonTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.input_location = 'data/test/images/image_reid_person.jpg' self.model_id = 'damo/cv_passvitb_image-reid-person_market' + self.task = Tasks.image_reid_person @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_image_reid_person(self): @@ -48,6 +50,10 @@ class ImageReidPersonTest(unittest.TestCase): ) print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py index 6738976c..286d317a 100644 --- a/tests/pipelines/test_image_semantic_segmentation.py +++ b/tests/pipelines/test_image_semantic_segmentation.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import unittest import cv2 @@ -7,17 +9,20 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image -from modelscope.utils.logger import get_logger +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageSemanticSegmentationTest(unittest.TestCase): +class ImageSemanticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'image-segmentation' + self.model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_image_semantic_segmentation_panmerge(self): input_location = 'data/test/images/image_semantic_segmentation.jpg' - model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge' - segmenter = pipeline(Tasks.image_segmentation, model=model_id) + segmenter = pipeline(Tasks.image_segmentation, model=self.model_id) result = segmenter(input_location) draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS]) @@ -34,8 +39,7 @@ class ImageSemanticSegmentationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_image_semantic_segmentation_vitadapter(self): input_location = 'data/test/images/image_semantic_segmentation.jpg' - model_id = 'damo/cv_vitadapter_semantic-segmentation_cocostuff164k' - segmenter = pipeline(Tasks.image_segmentation, model=model_id) + segmenter = pipeline(Tasks.image_segmentation, model='damo/cv_vitadapter_semantic-segmentation_cocostuff164k') result = segmenter(input_location) draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS]) @@ -49,6 +53,10 @@ class ImageSemanticSegmentationTest(unittest.TestCase): cv2.imwrite('result.jpg', draw_img) print('test_image_semantic_segmentation_vitadapter_from_PIL DONE') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git
a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py index 4e5bb69b..a02d5308 100644 --- a/tests/pipelines/test_image_style_transfer.py +++ b/tests/pipelines/test_image_style_transfer.py @@ -7,12 +7,14 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageStyleTransferTest(unittest.TestCase): +class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.image_style_transfer self.model_id = 'damo/cv_aams_style-transfer_damo' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -48,6 +50,10 @@ class ImageStyleTransferTest(unittest.TestCase): cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub_default_model done') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_image_super_resolution.py b/tests/pipelines/test_image_super_resolution.py index 8cf9e46f..d5cbebe8 100644 --- a/tests/pipelines/test_image_super_resolution.py +++ b/tests/pipelines/test_image_super_resolution.py @@ -8,14 +8,16 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageSuperResolutionTest(unittest.TestCase): +class ImageSuperResolutionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_rrdb_image-super-resolution' self.img = 'data/test/images/dogs.jpg' + self.task = Tasks.image_super_resolution def pipeline_inference(self, pipeline: Pipeline, img: str): result = pipeline(img) @@ -35,6 +37,10 @@ class ImageSuperResolutionTest(unittest.TestCase): super_resolution = pipeline(Tasks.image_super_resolution) self.pipeline_inference(super_resolution, self.img) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py index 32a853af..91f9f566 100644 --- a/tests/pipelines/test_key_word_spotting.py +++ b/tests/pipelines/test_key_word_spotting.py @@ -10,6 +10,7 @@ import soundfile from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import ColorCodes, Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import download_and_untar, test_level @@ -17,6 +18,7 @@ logger = get_logger() POS_WAV_FILE = 'data/test/audios/kws_xiaoyunxiaoyun.wav' BOFANGYINYUE_WAV_FILE = 'data/test/audios/kws_bofangyinyue.wav' +URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/20200707_xiaoyun.wav' POS_TESTSETS_FILE = 'pos_testsets.tar.gz' POS_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testsets.tar.gz' @@ 
-25,7 +27,7 @@ NEG_TESTSETS_FILE = 'neg_testsets.tar.gz' NEG_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/neg_testsets.tar.gz' -class KeyWordSpottingTest(unittest.TestCase): +class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): action_info = { 'test_run_with_wav': { 'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'], @@ -75,6 +77,22 @@ class KeyWordSpottingTest(unittest.TestCase): }] } }, + 'test_run_with_url': { + 'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'], + 'checking_value': '小云小云', + 'example': { + 'wav_count': + 1, + 'kws_type': + 'pcm', + 'kws_list': [{ + 'keyword': '小云小云', + 'offset': 0.69, + 'length': 1.67, + 'confidence': 0.996023 + }] + } + }, 'test_run_with_pos_testsets': { 'checking_item': ['recall'], 'example': { @@ -236,6 +254,12 @@ class KeyWordSpottingTest(unittest.TestCase): self.check_result('test_run_with_wav_by_customized_keywords', kws_result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_url(self): + kws_result = self.run_pipeline( + model_id=self.model_id, audio_in=URL_FILE) + self.check_result('test_run_with_url', kws_result) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_pos_testsets(self): wav_file_path = download_and_untar( @@ -272,6 +296,10 @@ class KeyWordSpottingTest(unittest.TestCase): model_id=self.model_id, audio_in=audio_list) self.check_result('test_run_with_roc', kws_result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index 4a732950..fea7afd7 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os.path import unittest @@ -6,6 +8,9 @@ from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav' +TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ + 'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \ + '?Revision=master&FilePath=examples/3ch_nihaomiya.wav' class KWSFarfieldTest(unittest.TestCase): @@ -13,7 +18,7 @@ class KWSFarfieldTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya' - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_normal(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)} @@ -21,6 +26,13 @@ class KWSFarfieldTest(unittest.TestCase): self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_url(self): + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + result = kws(TEST_SPEECH_URL) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_output(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) diff --git a/tests/pipelines/test_live_category.py b/tests/pipelines/test_live_category.py index dead376d..391ed283 100644 --- a/tests/pipelines/test_live_category.py +++ b/tests/pipelines/test_live_category.py @@ -3,20 +3,28 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class LiveCategoryTest(unittest.TestCase): +class LiveCategoryTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.live_category + self.model_id = 'damo/cv_resnet50_live-category' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - category_pipeline = pipeline( - Tasks.live_category, model='damo/cv_resnet50_live-category') + category_pipeline = pipeline(Tasks.live_category, self.model_id) result = category_pipeline( 'data/test/videos/live_category_test_video.mp4') print(f'live category output: {result}.') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_mog_face_detection.py b/tests/pipelines/test_mog_face_detection.py new file mode 100644 index 00000000..5c6d97c2 --- /dev/null +++ b/tests/pipelines/test_mog_face_detection.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os.path as osp +import unittest + +import cv2 + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result +from modelscope.utils.test_utils import test_level + + +class MogFaceDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_resnet101_face-detection_cvpr22papermogface' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_no_lm_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_detection, model=self.model_id) + img_path = 'data/test/images/mog_face_detection.jpg' + + result = face_detection(img_path) + self.show_result(img_path, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py index 5993c634..affd5140 100644 --- a/tests/pipelines/test_movie_scene_segmentation.py +++ b/tests/pipelines/test_movie_scene_segmentation.py @@ -3,17 +3,21 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class MovieSceneSegmentationTest(unittest.TestCase): +class MovieSceneSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.movie_scene_segmentation + self.model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_movie_scene_segmentation(self): input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4' - model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet' movie_scene_segmentation_pipeline = pipeline( - Tasks.movie_scene_segmentation, model=model_id) + Tasks.movie_scene_segmentation, model=self.model_id) result = movie_scene_segmentation_pipeline(input_location) if result: print(result) @@ -31,6 +35,10 @@ class MovieSceneSegmentationTest(unittest.TestCase): else: raise ValueError('process error') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py index 642ac11d..273d3105 100644 --- a/tests/pipelines/test_mplug_tasks.py +++ b/tests/pipelines/test_mplug_tasks.py @@ -7,10 +7,15 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class MplugTasksTest(unittest.TestCase): +class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'visual-question-answering' + self.model_id = 'damo/mplug_visual-question-answering_coco_large_en' @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_captioning_with_model(self): @@ -75,6 +80,10 @@ class MplugTasksTest(unittest.TestCase): result = pipeline_retrieval(input) 
print(result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_mtcnn_face_detection.py b/tests/pipelines/test_mtcnn_face_detection.py new file mode 100644 index 00000000..5afb5588 --- /dev/null +++ b/tests/pipelines/test_mtcnn_face_detection.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 +from PIL import Image + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_result +from modelscope.utils.test_utils import test_level + + +class MtcnnFaceDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_manual_face-detection_mtcnn' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_detection, model=self.model_id) + img_path = 'data/test/images/mtcnn_face_detection.jpg' + img = Image.open(img_path) + + result_1 = face_detection(img_path) + self.show_result(img_path, result_1) + + result_2 = face_detection(img) + self.show_result(img_path, result_2) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index f94e31fa..23954c27 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -8,11 +8,16 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class MultiModalEmbeddingTest(unittest.TestCase): - model_id = 'damo/multi-modal_clip-vit-base-patch16_zh' +class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.multi_modal_embedding + self.model_id = 'damo/multi-modal_clip-vit-base-patch16_zh' + test_input = {'text': '皮卡丘'} model_version = 'dev' @@ -54,6 +59,10 @@ class MultiModalEmbeddingTest(unittest.TestCase): print('l2-norm: {}'.format(torch.norm(text_embedding, dim=-1).item())) # should be 1.0 + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py index d1d6a7a8..192602b4 100644 --- a/tests/pipelines/test_multi_modal_similarity.py +++ b/tests/pipelines/test_multi_modal_similarity.py @@ -10,32 +10,38 @@ from modelscope.utils.test_utils import test_level class MultiModalSimilarityTest(unittest.TestCase): model_id = 'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity' - test_input = { - 'img': 'data/test/images/generative_multimodal.jpg', - 'text': '起居室照片' - } + test_img = 'data/test/images/multimodal_similarity.jpg' + test_str1 = '一个上了年纪的女人在城镇中骑着自行车一个黄色出租车正要从她身边驶过' + test_str2 = '穿着蓝色连衣裙的那个女人正冲着行来的车辆伸出她的手' + + def 
infer_pipeline(self, multi_modal_similarity_pipeline):
+        test_input1 = {'img': self.test_img, 'text': self.test_str1}
+        test_input2 = {'img': self.test_img, 'text': self.test_str2}
+        output1 = multi_modal_similarity_pipeline(test_input1)
+        output2 = multi_modal_similarity_pipeline(test_input2)
+        print('image: {}, text: {}, similarity: {}'.format(
+            self.test_img, self.test_str1, output1['scores']))
+        print('image: {}, text: {}, similarity: {}'.format(
+            self.test_img, self.test_str2, output2['scores']))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
         multi_modal_similarity_pipeline = pipeline(
             Tasks.multi_modal_similarity, model=self.model_id)
-        output = multi_modal_similarity_pipeline(self.test_input)
-        print(output)
+        self.infer_pipeline(multi_modal_similarity_pipeline)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         multi_modal_similarity_pipeline = pipeline(
             task=Tasks.multi_modal_similarity)
-        output = multi_modal_similarity_pipeline(self.test_input)
-        print(output)
+        self.infer_pipeline(multi_modal_similarity_pipeline)
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         multi_modal_similarity_pipeline = pipeline(
             task=Tasks.multi_modal_similarity, model=model)
-        output = multi_modal_similarity_pipeline(self.test_input)
-        print(output)
+        self.infer_pipeline(multi_modal_similarity_pipeline)
 
 
 if __name__ == '__main__':
diff --git a/tests/pipelines/test_multi_stage_diffusion.py b/tests/pipelines/test_multi_stage_diffusion.py
new file mode 100644
index 00000000..f4e63ce0
--- /dev/null
+++ b/tests/pipelines/test_multi_stage_diffusion.py
@@ -0,0 +1,40 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
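The refactored similarity tests above print a score for each caption but never compare them. The following consumption sketch assumes the 'scores' value used in those tests is a comparable scalar (higher meaning a better match); the pick-the-best-caption step is illustrative and not part of the test suite.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

similarity = pipeline(
    Tasks.multi_modal_similarity,
    model='damo/multi-modal_team-vit-large-patch14_multi-modal-similarity')
img = 'data/test/images/multimodal_similarity.jpg'
captions = [
    '一个上了年纪的女人在城镇中骑着自行车一个黄色出租车正要从她身边驶过',
    '穿着蓝色连衣裙的那个女人正冲着行来的车辆伸出她的手',
]
# assumed: output['scores'] is a scalar similarity for the (image, text) pair
scored = [(similarity({'img': img, 'text': text})['scores'], text) for text in captions]
best_score, best_caption = max(scored)
print(f'best caption ({best_score}): {best_caption}')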
+ +import unittest + +import numpy as np +import torch + +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class MultiStageDiffusionTest(unittest.TestCase): + model_id = 'damo/cv_diffusion_text-to-image-synthesis' + test_text = {'text': 'Photograph of a baby chicken wearing sunglasses'} + + @unittest.skip( + 'skip test since the pretrained model is not publicly available') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + pipe_line_text_to_image_synthesis = pipeline( + task=Tasks.text_to_image_synthesis, model=model) + img = pipe_line_text_to_image_synthesis( + self.test_text)[OutputKeys.OUTPUT_IMG] + print(np.sum(np.abs(img))) + + @unittest.skip( + 'skip test since the pretrained model is not publicly available') + def test_run_with_model_name(self): + pipe_line_text_to_image_synthesis = pipeline( + task=Tasks.text_to_image_synthesis, model=self.model_id) + img = pipe_line_text_to_image_synthesis( + self.test_text)[OutputKeys.OUTPUT_IMG] + print(np.sum(np.abs(img))) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index ad0fa228..9fae2d09 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -9,10 +9,16 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline from modelscope.preprocessors import NERPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class NamedEntityRecognitionTest(unittest.TestCase): +class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.named_entity_recognition + self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' + tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' @@ -88,6 +94,10 @@ class NamedEntityRecognitionTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) print(pipeline_ins(input=self.sentence)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 1d3fba12..a53ac3b3 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import PairSentenceClassificationPipeline from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level -class NLITest(unittest.TestCase): - model_id = 'damo/nlp_structbert_nli_chinese-base' +class NLITest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.nli + self.model_id = 'damo/nlp_structbert_nli_chinese-base' + sentence1 = 
'四川商务职业学院和四川财经职业学院哪个好?' sentence2 = '四川商务职业学院商务管理在哪个校区?' regress_tool = MsRegressTool(baseline=False) @@ -52,6 +57,10 @@ class NLITest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.nli) print(pipeline_ins(input=(self.sentence1, self.sentence2))) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index de16aaa1..2a74eb41 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -3,10 +3,15 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ObjectDetectionTest(unittest.TestCase): +class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.human_detection + self.model_id = 'damo/cv_resnet18_human-detection' @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_object_detection(self): @@ -50,6 +55,10 @@ class ObjectDetectionTest(unittest.TestCase): else: raise ValueError('process error') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py index a4201512..e0591496 100644 --- a/tests/pipelines/test_ocr_detection.py +++ b/tests/pipelines/test_ocr_detection.py @@ -4,14 +4,16 @@ import unittest from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class OCRDetectionTest(unittest.TestCase): +class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo' self.test_image = 'data/test/images/ocr_detection.jpg' + self.task = Tasks.ocr_detection def pipeline_inference(self, pipeline: Pipeline, input_location: str): result = pipeline(input_location) @@ -28,6 +30,10 @@ class OCRDetectionTest(unittest.TestCase): ocr_detection = pipeline(Tasks.ocr_detection) self.pipeline_inference(ocr_detection, self.test_image) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py index a2e5ba8e..8d48dd7a 100644 --- a/tests/pipelines/test_ocr_recognition.py +++ b/tests/pipelines/test_ocr_recognition.py @@ -1,26 +1,21 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
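A pattern worth calling out: nearly every test class in this diff now mixes in DemoCompatibilityCheck and sets self.task and self.model_id in setUp, with the actual check skipped by default. The real mixin lives in modelscope.utils.demo_utils and is not shown in this diff; the sketch below is only a guess at its shape, included to make the contract between setUp and compatibility_check concrete.

import unittest

from modelscope.utils.constant import Tasks


class DemoCompatibilityCheckSketch:
    """Hypothetical stand-in for modelscope.utils.demo_utils.DemoCompatibilityCheck."""

    def compatibility_check(self):
        # The mixin relies on the host TestCase's setUp having populated these two
        # attributes; what the real check does with them (presumably validating the
        # task/model pair against the online demo service) is an assumption here.
        assert getattr(self, 'task', None), 'setUp must set self.task'
        assert getattr(self, 'model_id', None), 'setUp must set self.model_id'
        print(f'checking demo compatibility of {self.model_id} for task {self.task}')


class ExampleDemoCompatTest(unittest.TestCase, DemoCompatibilityCheckSketch):

    def setUp(self) -> None:
        self.task = Tasks.ocr_detection
        self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo'

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()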
-import os.path as osp -import shutil -import sys -import tempfile import unittest -from typing import Any, Dict, List, Tuple, Union -import cv2 -import numpy as np import PIL from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class OCRRecognitionTest(unittest.TestCase): +class OCRRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_convnextTiny_ocr-recognition-general_damo' self.test_image = 'data/test/images/ocr_recognition.jpg' + self.task = Tasks.ocr_recognition def pipeline_inference(self, pipeline: Pipeline, input_location: str): result = pipeline(input_location) @@ -42,6 +37,10 @@ class OCRRecognitionTest(unittest.TestCase): ocr_recognition = pipeline(Tasks.ocr_recognition) self.pipeline_inference(ocr_recognition, self.test_image) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 8779ba48..9a72d1ff 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -11,10 +11,11 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import created_boxed_image +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class OfaTasksTest(unittest.TestCase): +class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.output_dir = 'unittest_output' @@ -169,7 +170,6 @@ class OfaTasksTest(unittest.TestCase): ofa_pipe = pipeline(Tasks.visual_grounding, model=model) image = 'data/test/images/visual_grounding.png' text = '一个圆头的蓝色宝可梦' - text = '火' input = {'image': image, 'text': text} result = ofa_pipe(input) print(result) @@ -252,26 +252,9 @@ class OfaTasksTest(unittest.TestCase): result[OutputKeys.OUTPUT_IMG].save('result.png') print(f'Output written to {osp.abspath("result.png")}') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_visual_question_answering_huge_with_name(self): - model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_visual-question-answering_pretrain_huge_en' - ofa_pipe = pipeline(Tasks.visual_question_answering, model=model) - image = 'data/test/images/visual_question_answering.png' - text = 'what is grown on the plant?' 
- input = {'image': image, 'text': text} - result = ofa_pipe(input) - print(result) - - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_run_with_image_captioning_huge_with_name(self): - model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_huge_en' - img_captioning = pipeline( - task=Tasks.image_captioning, - model=model, - ) - result = img_captioning( - {'image': 'data/test/images/image_captioning.png'}) - print(result[OutputKeys.CAPTION]) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() if __name__ == '__main__': diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py new file mode 100644 index 00000000..25f4491c --- /dev/null +++ b/tests/pipelines/test_part_of_speech.py @@ -0,0 +1,55 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import shutil +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import TokenClassificationModel +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import TokenClassificationPipeline +from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class PartOfSpeechTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_part-of-speech_chinese-base' + sentence = '今天天气不错,适合出去游玩' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = TokenClassificationPreprocessor(cache_path) + model = TokenClassificationModel.from_pretrained(cache_path) + pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.token_classification, model=model, preprocessor=tokenizer) + print(f'sentence: {self.sentence}\n' + f'pipeline1:{pipeline1(input=self.sentence)}') + print() + print(f'pipeline2: {pipeline2(input=self.sentence)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = TokenClassificationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.token_classification, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.token_classification, model=self.model_id) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.token_classification) + print(pipeline_ins(input=self.sentence)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_passage_ranking.py b/tests/pipelines/test_passage_ranking.py new file mode 100644 index 00000000..5faa365e --- /dev/null +++ b/tests/pipelines/test_passage_ranking.py @@ -0,0 +1,61 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
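All of the tests in this diff gate themselves on test_level() from modelscope.utils.test_utils. The real helper is not shown here; the sketch below is a minimal stand-in whose environment-variable name and default are assumptions, included only to make the skip decorators easier to read.

import os


def test_level_sketch() -> int:
    # Hypothetical stand-in for modelscope.utils.test_utils.test_level(). Assumed
    # behaviour: read an integer level from the environment, defaulting to the
    # cheapest level so that only the ">= 0" tests run outside CI. Both the
    # variable name and the default are assumptions, not taken from this diff.
    return int(os.environ.get('TEST_LEVEL', 0))

Under that assumption, exporting a higher TEST_LEVEL in CI would enable the slower snapshot-download and default-model tests that stay skipped at level 0.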
+import shutil +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import PassageRanking +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import PassageRankingPipeline +from modelscope.preprocessors import PassageRankingPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class PassageRankingTest(unittest.TestCase): + model_id = 'damo/nlp_corom_passage-ranking_english-base' + inputs = { + 'source_sentence': ["how long it take to get a master's degree"], + 'sentences_to_compare': [ + "On average, students take about 18 to 24 months to complete a master's degree.", + 'On the other hand, some students prefer to go at a slower pace and choose to take ' + 'several years to complete their studies.', + 'It can take anywhere from two semesters' + ] + } + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = PassageRankingPreprocessor(cache_path) + model = PassageRanking.from_pretrained(cache_path) + pipeline1 = PassageRankingPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.passage_ranking, model=model, preprocessor=tokenizer) + print(f'sentence: {self.inputs}\n' + f'pipeline1:{pipeline1(input=self.inputs)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = PassageRankingPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.passage_ranking, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.inputs)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.passage_ranking, model=self.model_id) + print(pipeline_ins(input=self.inputs)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.passage_ranking) + print(pipeline_ins(input=self.inputs)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py index bdbf8b61..b8549f4f 100644 --- a/tests/pipelines/test_person_image_cartoon.py +++ b/tests/pipelines/test_person_image_cartoon.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
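The passage-ranking test above only prints the raw pipeline output. The small consumption sketch below assumes the output carries a 'scores' sequence aligned with 'sentences_to_compare'; that is an assumption about the pipeline's output format rather than something stated in this diff.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

ranker = pipeline(
    Tasks.passage_ranking, model='damo/nlp_corom_passage-ranking_english-base')
query = "how long it take to get a master's degree"
candidates = [
    "On average, students take about 18 to 24 months to complete a master's degree.",
    'It can take anywhere from two semesters',
]
result = ranker({'source_sentence': [query], 'sentences_to_compare': candidates})
# assumed: result['scores'][i] is the relevance of candidates[i] to the query
for score, passage in sorted(zip(result['scores'], candidates), reverse=True):
    print(f'{score:.4f}\t{passage}')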
-import os import os.path as osp import unittest @@ -9,13 +8,19 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ImageCartoonTest(unittest.TestCase): +class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_unet_person-image-cartoon_compound-models' + self.model_id_3d = 'damo/cv_unet_person-image-cartoon-3d_compound-models' + self.model_id_handdrawn = 'damo/cv_unet_person-image-cartoon-handdrawn_compound-models' + self.model_id_sketch = 'damo/cv_unet_person-image-cartoon-sketch_compound-models' + self.model_id_artstyle = 'damo/cv_unet_person-image-cartoon-artstyle_compound-models' + self.task = Tasks.image_portrait_stylization self.test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png' def pipeline_inference(self, pipeline: Pipeline, input_location: str): @@ -30,11 +35,39 @@ class ImageCartoonTest(unittest.TestCase): Tasks.image_portrait_stylization, model=self.model_id) self.pipeline_inference(img_cartoon, self.test_image) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub_3d(self): + img_cartoon = pipeline( + Tasks.image_portrait_stylization, model=self.model_id_3d) + self.pipeline_inference(img_cartoon, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub_handdrawn(self): + img_cartoon = pipeline( + Tasks.image_portrait_stylization, model=self.model_id_handdrawn) + self.pipeline_inference(img_cartoon, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub_sketch(self): + img_cartoon = pipeline( + Tasks.image_portrait_stylization, model=self.model_id_sketch) + self.pipeline_inference(img_cartoon, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub_artstyle(self): + img_cartoon = pipeline( + Tasks.image_portrait_stylization, model=self.model_id_artstyle) + self.pipeline_inference(img_cartoon, self.test_image) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): img_cartoon = pipeline(Tasks.image_portrait_stylization) self.pipeline_inference(img_cartoon, self.test_image) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_plug_text_generation.py b/tests/pipelines/test_plug_text_generation.py new file mode 100644 index 00000000..90b48efa --- /dev/null +++ b/tests/pipelines/test_plug_text_generation.py @@ -0,0 +1,49 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + + +class TextPlugGenerationTest(unittest.TestCase): + + def setUp(self) -> None: + # please make sure this local path exists. 
+ self.model_id = 'damo/nlp_plug_text-generation_27B' + self.model_dir = snapshot_download(self.model_id) + self.plug_input = '段誉轻挥折扇,摇了摇头,说道:“你师父是你的师父,你师父可不是我的师父。"' + + @unittest.skip('distributed plug, skipped') + def test_plug(self): + """ The model can be downloaded from the link on + https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary. + After downloading, you should have a plug model structure like this: + nlp_plug_text-generation_27B + |_ config.json + |_ configuration.json + |_ ds_zero-offload_10B_config.json + |_ vocab.txt + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + """ + # download model binaries to /model + pipe = pipeline(Tasks.text_generation, model=self.model_id) + print( + f'input: {self.plug_input}\noutput: {pipe(self.plug_input, out_length=256)}' + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_product_retrieval_embedding.py b/tests/pipelines/test_product_retrieval_embedding.py index c416943e..2483d53a 100644 --- a/tests/pipelines/test_product_retrieval_embedding.py +++ b/tests/pipelines/test_product_retrieval_embedding.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import unittest import numpy as np @@ -6,11 +8,16 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class ProductRetrievalEmbeddingTest(unittest.TestCase): - model_id = 'damo/cv_resnet50_product-bag-embedding-models' +class ProductRetrievalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.product_retrieval_embedding + self.model_id = 'damo/cv_resnet50_product-bag-embedding-models' + img_input = 'data/test/images/product_embed_bag.jpg' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -34,6 +41,10 @@ class ProductRetrievalEmbeddingTest(unittest.TestCase): result = product_embed(self.img_input)[OutputKeys.IMG_EMBEDDING] print('abs sum value is: {}'.format(np.sum(np.abs(result)))) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py index 03ddacf4..e04f6b5c 100644 --- a/tests/pipelines/test_realtime_object_detection.py +++ b/tests/pipelines/test_realtime_object_detection.py @@ -2,22 +2,22 @@ import unittest import cv2 -import numpy as np from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import realtime_object_detection_bbox_vis +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class RealtimeObjectDetectionTest(unittest.TestCase): +class 
RealtimeObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_cspnet_image-object-detection_yolox' self.model_nano_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco' self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg' + self.task = Tasks.image_object_detection @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): @@ -47,6 +47,10 @@ class RealtimeObjectDetectionTest(unittest.TestCase): else: raise ValueError('process error') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py index 20502a19..57d98f66 100644 --- a/tests/pipelines/test_relation_extraction.py +++ b/tests/pipelines/test_relation_extraction.py @@ -1,8 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest -import torch - from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import InformationExtractionModel @@ -10,11 +8,16 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import InformationExtractionPipeline from modelscope.preprocessors import RelationExtractionPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class RelationExtractionTest(unittest.TestCase): - model_id = 'damo/nlp_bert_relation-extraction_chinese-base' +class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.information_extraction + self.model_id = 'damo/nlp_bert_relation-extraction_chinese-base' + sentence = '高捷,祖籍江苏,本科毕业于东南大学' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -52,6 +55,10 @@ class RelationExtractionTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.information_extraction) print(pipeline_ins(input=self.sentence)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py index ec010b17..e87e9388 100644 --- a/tests/pipelines/test_salient_detection.py +++ b/tests/pipelines/test_salient_detection.py @@ -4,10 +4,15 @@ import unittest from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class SalientDetectionTest(unittest.TestCase): +class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_u2net_salient-detection' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_salient_detection(self): @@ -19,6 +24,10 @@ class SalientDetectionTest(unittest.TestCase): # result[OutputKeys.MASKS] is salient map result,other keys are not used cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS]) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def 
test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py new file mode 100644 index 00000000..739dd7ab --- /dev/null +++ b/tests/pipelines/test_sentence_embedding.py @@ -0,0 +1,82 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import shutil +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SentenceEmbedding +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import SentenceEmbeddingPipeline +from modelscope.preprocessors import SentenceEmbeddingPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SentenceEmbeddingTest(unittest.TestCase): + model_id = 'damo/nlp_corom_sentence-embedding_english-base' + inputs = { + 'source_sentence': ["how long it take to get a master's degree"], + 'sentences_to_compare': [ + "On average, students take about 18 to 24 months to complete a master's degree.", + 'On the other hand, some students prefer to go at a slower pace and choose to take ', + 'several years to complete their studies.', + 'It can take anywhere from two semesters' + ] + } + + inputs2 = { + 'source_sentence': ["how long it take to get a master's degree"], + 'sentences_to_compare': [ + "On average, students take about 18 to 24 months to complete a master's degree." + ] + } + + inputs3 = { + 'source_sentence': ["how long it take to get a master's degree"], + 'sentences_to_compare': [] + } + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = SentenceEmbeddingPreprocessor(cache_path) + model = SentenceEmbedding.from_pretrained(cache_path) + pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentence_embedding, model=model, preprocessor=tokenizer) + print(f'inputs: {self.inputs}\n' + f'pipeline1:{pipeline1(input=self.inputs)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs)}') + print() + print(f'inputs: {self.inputs2}\n' + f'pipeline1:{pipeline1(input=self.inputs2)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs2)}') + print(f'inputs: {self.inputs3}\n' + f'pipeline1:{pipeline1(input=self.inputs3)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs3)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = SentenceEmbeddingPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.inputs)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=self.model_id) + print(pipeline_ins(input=self.inputs)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.sentence_embedding) + print(pipeline_ins(input=self.inputs)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 
6990bf75..4079455d 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import PairSentenceClassificationPipeline from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level -class SentenceSimilarityTest(unittest.TestCase): - model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' +class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.sentence_similarity + self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + sentence1 = '今天气温比昨天高么?' sentence2 = '今天湿度比昨天高么?' regress_tool = MsRegressTool(baseline=False) @@ -58,6 +63,10 @@ class SentenceSimilarityTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.sentence_similarity) print(pipeline_ins(input=(self.sentence1, self.sentence2))) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 35c96282..3db9971a 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -9,11 +9,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline from modelscope.preprocessors import SingleSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class SentimentClassificationTaskModelTest(unittest.TestCase): - model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' +class SentimentClassificationTaskModelTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.sentiment_classification + self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' + sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -60,6 +66,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase): self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_skin_retouching.py b/tests/pipelines/test_skin_retouching.py index c6dbee2c..db8d89ed 100644 --- a/tests/pipelines/test_skin_retouching.py +++ b/tests/pipelines/test_skin_retouching.py @@ -9,12 +9,14 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class SkinRetouchingTest(unittest.TestCase): +class SkinRetouchingTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.skin_retouching 
self.model_id = 'damo/cv_unet_skin-retouching' self.test_image = 'data/test/images/skin_retouching.png' @@ -39,6 +41,10 @@ class SkinRetouchingTest(unittest.TestCase): skin_retouching = pipeline(Tasks.skin_retouching) self.pipeline_inference(skin_retouching, self.test_image) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index 007e6c73..e5f97c02 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -1,25 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path -import shutil import unittest -from modelscope.fileio import File from modelscope.metainfo import Pipelines from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav' FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav' +NEAREND_MIC_URL = 'https://modelscope.cn/api/v1/models/damo/' \ + 'speech_dfsmn_aec_psm_16k/repo?Revision=master' \ + '&FilePath=examples/nearend_mic.wav' +FAREND_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ + 'speech_dfsmn_aec_psm_16k/repo?Revision=master' \ + '&FilePath=examples/farend_speech.wav' NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav' +NOISE_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ + 'speech_frcrn_ans_cirm_16k/repo?Revision=master' \ + '&FilePath=examples/speech_with_noise.wav' -class SpeechSignalProcessTest(unittest.TestCase): +class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: pass - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_aec(self): model_id = 'damo/speech_dfsmn_aec_psm_16k' input = { @@ -31,6 +41,18 @@ class SpeechSignalProcessTest(unittest.TestCase): aec(input, output_path=output_path) print(f'Processed audio saved to {output_path}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_aec_url(self): + model_id = 'damo/speech_dfsmn_aec_psm_16k' + input = { + 'nearend_mic': NEAREND_MIC_URL, + 'farend_speech': FAREND_SPEECH_URL + } + aec = pipeline(Tasks.acoustic_echo_cancellation, model=model_id) + output_path = os.path.abspath('output.wav') + aec(input, output_path=output_path) + print(f'Processed audio saved to {output_path}') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_aec_bytes(self): model_id = 'damo/speech_dfsmn_aec_psm_16k' @@ -63,7 +85,7 @@ class SpeechSignalProcessTest(unittest.TestCase): aec(inputs, output_path=output_path) print(f'Processed audio saved to {output_path}') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ans(self): model_id = 'damo/speech_frcrn_ans_cirm_16k' ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) @@ -72,6 +94,14 @@ class SpeechSignalProcessTest(unittest.TestCase): output_path=output_path) print(f'Processed audio saved to {output_path}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def 
test_ans_url(self): + model_id = 'damo/speech_frcrn_ans_cirm_16k' + ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) + output_path = os.path.abspath('output.wav') + ans(NOISE_SPEECH_URL, output_path=output_path) + print(f'Processed audio saved to {output_path}') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ans_bytes(self): model_id = 'damo/speech_frcrn_ans_cirm_16k' @@ -85,6 +115,10 @@ class SpeechSignalProcessTest(unittest.TestCase): ans(data, output_path=output_path) print(f'Processed audio saved to {output_path}') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py new file mode 100644 index 00000000..7ea28725 --- /dev/null +++ b/tests/pipelines/test_table_question_answering.py @@ -0,0 +1,75 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import unittest + +from transformers import BertTokenizer + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline +from modelscope.preprocessors import TableQuestionAnsweringPreprocessor +from modelscope.preprocessors.star3.fields.database import Database +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.nlp.nlp_utils import tableqa_tracking_and_print_results +from modelscope.utils.test_utils import test_level + + +class TableQuestionAnswering(unittest.TestCase): + + def setUp(self) -> None: + self.task = Tasks.table_question_answering + self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn' + + model_id = 'damo/nlp_convai_text2sql_pretrain_cn' + test_case = { + 'utterance': + ['长江流域的小(2)型水库的库容总量是多少?', '那平均值是多少?', '那水库的名称呢?', '换成中型的呢?'] + } + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + preprocessor = TableQuestionAnsweringPreprocessor(model_dir=cache_path) + pipelines = [ + TableQuestionAnsweringPipeline( + model=cache_path, preprocessor=preprocessor) + ] + tableqa_tracking_and_print_results(self.test_case, pipelines) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = TableQuestionAnsweringPreprocessor( + model_dir=model.model_dir) + pipelines = [ + TableQuestionAnsweringPipeline( + model=model, preprocessor=preprocessor) + ] + tableqa_tracking_and_print_results(self.test_case, pipelines) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_task(self): + pipelines = [pipeline(Tasks.table_question_answering, self.model_id)] + tableqa_tracking_and_print_results(self.test_case, pipelines) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub_with_other_classes(self): + model = Model.from_pretrained(self.model_id) + self.tokenizer = BertTokenizer( + os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) + db = Database( + tokenizer=self.tokenizer, + table_file_path=os.path.join(model.model_dir, 'table.json'), + syn_dict_file_path=os.path.join(model.model_dir, 
'synonym.txt')) + preprocessor = TableQuestionAnsweringPreprocessor( + model_dir=model.model_dir, db=db) + pipelines = [ + TableQuestionAnsweringPipeline( + model=model, preprocessor=preprocessor, db=db) + ] + tableqa_tracking_and_print_results(self.test_case, pipelines) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py index 542568d1..71b9f3e2 100644 --- a/tests/pipelines/test_text_classification.py +++ b/tests/pipelines/test_text_classification.py @@ -6,14 +6,16 @@ from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import SequenceClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor -from modelscope.utils.constant import Hubs, Tasks +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class SequenceClassificationTest(unittest.TestCase): +class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/bert-base-sst2' + self.task = Tasks.text_classification def predict(self, pipeline_ins: SequenceClassificationPipeline): from easynlp.appzoo import load_dataset @@ -87,6 +89,10 @@ class SequenceClassificationTest(unittest.TestCase): result = text_classification(dataset) self.printDataset(result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py index 741787d9..a67729ff 100644 --- a/tests/pipelines/test_text_driven_segmentation.py +++ b/tests/pipelines/test_text_driven_segmentation.py @@ -23,6 +23,10 @@ class TextDrivenSegmentationTest(unittest.TestCase): # result[OutputKeys.MASKS] is segment map result,other keys are not used cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS]) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.test_demo() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_text_error_correction.py b/tests/pipelines/test_text_error_correction.py index 5a1890ce..a714d3d0 100644 --- a/tests/pipelines/test_text_error_correction.py +++ b/tests/pipelines/test_text_error_correction.py @@ -8,11 +8,16 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextErrorCorrectionPipeline from modelscope.preprocessors import TextErrorCorrectionPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TextErrorCorrectionTest(unittest.TestCase): - model_id = 'damo/nlp_bart_text-error-correction_chinese' +class TextErrorCorrectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.text_error_correction + self.model_id = 'damo/nlp_bart_text-error-correction_chinese' + input = '随着中国经济突飞猛近,建造工业与日俱增' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -50,6 +55,10 @@ class TextErrorCorrectionTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.text_error_correction) print(pipeline_ins(self.input)) + @unittest.skip('demo compatibility test is 
only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index c08209a4..66f9c9da 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -8,10 +8,11 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextGenerationPipeline from modelscope.preprocessors import TextGenerationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TextGenerationTest(unittest.TestCase): +class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.palm_model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base' @@ -128,6 +129,10 @@ class TextGenerationTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.text_generation) print(pipeline_ins(self.palm_input_zh)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py index 32778ffb..0da6768a 100644 --- a/tests/pipelines/test_text_to_image_synthesis.py +++ b/tests/pipelines/test_text_to_image_synthesis.py @@ -8,11 +8,16 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TextToImageSynthesisTest(unittest.TestCase): - model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny' +class TextToImageSynthesisTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.text_to_image_synthesis + self.model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny' + test_text = { 'text': '宇航员', 'generator_ddim_timesteps': 2, @@ -46,6 +51,10 @@ class TextToImageSynthesisTest(unittest.TestCase): self.test_text)[OutputKeys.OUTPUT_IMG] print(np.sum(np.abs(img))) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 74cab01f..e82cf43e 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import unittest # NOTICE: Tensorflow 1.15 seems not so compatible with pytorch. 
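Following on from the text-to-image synthesis test above, which only prints the absolute sum of the generated image, here is a sketch of persisting the output instead. Passing only the 'text' key assumes the tiny model has workable defaults for the generator parameters shown in the test, and treating OUTPUT_IMG as an array that cv2.imwrite accepts directly is also an assumption.

import cv2
import numpy as np

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

t2i = pipeline(
    Tasks.text_to_image_synthesis,
    model='damo/cv_diffusion_text-to-image-synthesis_tiny')
img = t2i({'text': '宇航员'})[OutputKeys.OUTPUT_IMG]
print('non-empty output:', np.sum(np.abs(img)) > 0)  # mirrors the check in the test above
cv2.imwrite('t2i_result.png', img)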
@@ -10,6 +12,7 @@ from scipy.io.wavfile import write from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level @@ -18,22 +21,29 @@ import tensorflow as tf # isort:skip logger = get_logger() -class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase): +class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.text_to_speech + self.model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): text = '今天北京天气怎么样?' - model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k' voice = 'zhitian_emo' - sambert_hifigan_tts = pipeline( - task=Tasks.text_to_speech, model=model_id) + sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id) self.assertTrue(sambert_hifigan_tts is not None) output = sambert_hifigan_tts(input=text, voice=voice) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) pcm = output[OutputKeys.OUTPUT_PCM] write('output.wav', 16000, pcm) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_tinynas_classification.py b/tests/pipelines/test_tinynas_classification.py index d64b5bc0..ebc6b722 100644 --- a/tests/pipelines/test_tinynas_classification.py +++ b/tests/pipelines/test_tinynas_classification.py @@ -1,11 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TinyNASClassificationTest(unittest.TestCase): +class TinyNASClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_classification + self.model_id = 'damo/cv_tinynas_classification' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): @@ -14,6 +21,10 @@ class TinyNASClassificationTest(unittest.TestCase): result = tinynas_classification('data/test/images/image_wolf.jpeg') print(result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py index 6b2ecd0b..63db9145 100644 --- a/tests/pipelines/test_tinynas_detection.py +++ b/tests/pipelines/test_tinynas_detection.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
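A convenience sketch, not part of this diff, for running one of these pipeline test modules at a chosen level; it assumes, as sketched earlier, that test_level() is backed by a TEST_LEVEL environment variable, and that the tests package is importable from the repository root.

import os
import unittest

# TEST_LEVEL must be set before the test module is imported, because the
# @unittest.skipUnless(test_level() >= n, ...) decorators are evaluated at import time.
os.environ['TEST_LEVEL'] = '1'
suite = unittest.defaultTestLoader.loadTestsFromName(
    'tests.pipelines.test_tinynas_classification')
unittest.TextTestRunner(verbosity=2).run(suite)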
+ import unittest from modelscope.pipelines import pipeline @@ -15,6 +17,10 @@ class TinynasObjectDetectionTest(unittest.TestCase): 'data/test/images/image_detection.jpg') print(result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.test_demo() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_ulfd_face_detection.py b/tests/pipelines/test_ulfd_face_detection.py new file mode 100644 index 00000000..0ffa688c --- /dev/null +++ b/tests/pipelines/test_ulfd_face_detection.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 +import numpy as np + +from modelscope.msdatasets import MsDataset +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result +from modelscope.utils.test_utils import test_level + + +class UlfdFaceDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_manual_face-detection_ulfd' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_no_lm_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_detection, model=self.model_id) + img_path = 'data/test/images/ulfd_face_detection.jpg' + + result = face_detection(img_path) + self.show_result(img_path, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_video_category.py b/tests/pipelines/test_video_category.py index aba56676..660196b8 100644 --- a/tests/pipelines/test_video_category.py +++ b/tests/pipelines/test_video_category.py @@ -3,20 +3,28 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class VideoCategoryTest(unittest.TestCase): +class VideoCategoryTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_category + self.model_id = 'damo/cv_resnet50_video-category' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - category_pipeline = pipeline( - Tasks.video_category, model='damo/cv_resnet50_video-category') + category_pipeline = pipeline(Tasks.video_category, self.model_id) result = category_pipeline( 'data/test/videos/video_category_test_video.mp4') print(f'video category output: {result}.') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_video_inpainting.py b/tests/pipelines/test_video_inpainting.py new file mode 100644 index 00000000..8364b1b3 --- /dev/null +++ b/tests/pipelines/test_video_inpainting.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
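The face-detection tests above delegate drawing to helpers in modelscope.utils.cv.image_utils, which are not shown in this diff. A hypothetical minimal equivalent follows; the 'boxes' key and the [x1, y1, x2, y2] layout it assumes are guesses about the detection output, for illustration only.

import cv2


def draw_boxes_sketch(img_path, detection_result, out_path='result.png'):
    # Hypothetical stand-in for draw_face_detection_no_lm_result; box format assumed.
    img = cv2.imread(img_path)
    for box in detection_result.get('boxes', []):
        x1, y1, x2, y2 = (int(v) for v in box)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.imwrite(out_path, img)
    return out_path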
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class VideoInpaintingTest(unittest.TestCase): + + def setUp(self) -> None: + self.model = 'damo/cv_video-inpainting' + self.mask_dir = 'data/test/videos/mask_dir' + self.video_in = 'data/test/videos/video_inpainting_test.mp4' + self.video_out = 'out.mp4' + self.input = { + 'video_input_path': self.video_in, + 'video_output_path': self.video_out, + 'mask_path': self.mask_dir + } + + def pipeline_inference(self, pipeline: Pipeline, input: str): + result = pipeline(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + video_inpainting = pipeline(Tasks.video_inpainting, model=self.model) + self.pipeline_inference(video_inpainting, self.input) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + video_inpainting = pipeline(Tasks.video_inpainting) + self.pipeline_inference(video_inpainting, self.input) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_video_multi_modal_embedding.py b/tests/pipelines/test_video_multi_modal_embedding.py index b33ba56c..f4aa4d24 100644 --- a/tests/pipelines/test_video_multi_modal_embedding.py +++ b/tests/pipelines/test_video_multi_modal_embedding.py @@ -4,15 +4,19 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level logger = get_logger() -class VideoMultiModalEmbeddingTest(unittest.TestCase): +class VideoMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_multi_modal_embedding + self.model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53' - model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53' video_path = 'data/test/videos/multi_modal_test_video_9770.mp4' caption = ('a person is connecting something to system', None, None) _input = {'video': video_path, 'text': caption} @@ -37,6 +41,10 @@ class VideoMultiModalEmbeddingTest(unittest.TestCase): logger.info('video feature: {}'.format( output['video_embedding'][0][0][0])) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py index fc228cd8..7f3a9226 100644 --- a/tests/pipelines/test_video_single_object_tracking.py +++ b/tests/pipelines/test_video_single_object_tracking.py @@ -5,12 +5,14 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import show_video_tracking_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class SingleObjectTracking(unittest.TestCase): +class SingleObjectTracking(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: + self.task = Tasks.video_single_object_tracking self.model_id = 'damo/cv_vitb_video-single-object-tracking_ostrack' @unittest.skipUnless(test_level() >= 0, 
'skip test in current test level') @@ -33,6 +35,10 @@ class SingleObjectTracking(unittest.TestCase): result = video_single_object_tracking((video_path, init_bbox)) print('result is : ', result[OutputKeys.BOXES]) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py index 12a0ee07..6dcc31e9 100644 --- a/tests/pipelines/test_video_summarization.py +++ b/tests/pipelines/test_video_summarization.py @@ -4,17 +4,21 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import show_video_summarization_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class VideoSummarizationTest(unittest.TestCase): +class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_summarization + self.model_id = 'damo/cv_googlenet_pgl-video-summarization' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - model_id = 'damo/cv_googlenet_pgl-video-summarization' video_path = 'data/test/videos/video_category_test_video.mp4' summarization_pipeline = pipeline( - Tasks.video_summarization, model=model_id) + Tasks.video_summarization, model=self.model_id) result = summarization_pipeline(video_path) print(f'video summarization output: \n{result}.') @@ -29,6 +33,10 @@ class VideoSummarizationTest(unittest.TestCase): print(f'video summarization output:\n {result}.') + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_virtual_try_on.py b/tests/pipelines/test_virtual_try_on.py index 1979c9b8..5c18dcc4 100644 --- a/tests/pipelines/test_virtual_try_on.py +++ b/tests/pipelines/test_virtual_try_on.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import unittest import cv2 @@ -6,11 +8,16 @@ from PIL import Image from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class VirtualTryonTest(unittest.TestCase): - model_id = 'damo/cv_daflow_virtual-try-on_base' +class VirtualTryonTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.virtual_try_on + self.model_id = 'damo/cv_daflow_virtual-try-on_base' + masked_model = Image.open('data/test/images/virtual_tryon_model.jpg') pose = Image.open('data/test/images/virtual_tryon_pose.jpg') cloth = Image.open('data/test/images/virtual_tryon_cloth.jpg') @@ -29,6 +36,10 @@ class VirtualTryonTest(unittest.TestCase): img = pipeline_virtual_tryon(self.input_imgs)[OutputKeys.OUTPUT_IMG] cv2.imwrite('demo.jpg', img[:, :, ::-1]) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index 87006f96..cd01b98f 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import shutil import unittest from modelscope.hub.snapshot_download import snapshot_download @@ -9,12 +8,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import WordSegmentationPipeline from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level -class WordSegmentationTest(unittest.TestCase): - model_id = 'damo/nlp_structbert_word-segmentation_chinese-base' +class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.word_segmentation + self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base' + sentence = '今天天气不错,适合出去游玩' sentence_eng = 'I am a program.' 
regress_tool = MsRegressTool(baseline=False) @@ -55,6 +59,10 @@ class WordSegmentationTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.word_segmentation) print(pipeline_ins(input=self.sentence)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py index f0f2a481..da1854c9 100644 --- a/tests/pipelines/test_zero_shot_classification.py +++ b/tests/pipelines/test_zero_shot_classification.py @@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import ZeroShotClassificationPipeline from modelscope.preprocessors import ZeroShotClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level -class ZeroShotClassificationTest(unittest.TestCase): - model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base' +class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.zero_shot_classification + self.model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base' + sentence = '全新突破 解放军运20版空中加油机曝光' labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'] template = '这篇文章的标题是{}' @@ -65,6 +70,10 @@ class ZeroShotClassificationTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.zero_shot_classification) print(pipeline_ins(input=self.sentence, candidate_labels=self.labels)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + if __name__ == '__main__': unittest.main() diff --git a/tests/run.py b/tests/run.py index 478cb9d6..b286ecb5 100644 --- a/tests/run.py +++ b/tests/run.py @@ -24,7 +24,9 @@ import torch import yaml from modelscope.utils.logger import get_logger -from modelscope.utils.test_utils import set_test_level, test_level +from modelscope.utils.model_tag import ModelTag, commit_model_ut_result +from modelscope.utils.test_utils import (get_case_model_info, set_test_level, + test_level) logger = get_logger() @@ -54,17 +56,34 @@ def statistics_test_result(df): if failures_cases > 0 or \ error_cases > 0 or \ unexpected_success_cases > 0: - result = 'FAILED' + final_result = 'FAILED' else: - result = 'SUCCESS' + final_result = 'SUCCESS' result_msg = '%s (Runs=%s,success=%s,failures=%s,errors=%s,\ skipped=%s,expected failures=%s,unexpected successes=%s)' % ( - result, total_cases, success_cases, failures_cases, error_cases, + final_result, total_cases, success_cases, failures_cases, error_cases, skipped_cases, expected_failure_cases, unexpected_success_cases) + model_cases = get_case_model_info() + for model_name, case_info in model_cases.items(): + cases = df.loc[df['Name'].str.contains('|'.join(list(case_info)))] + results = cases['Result'] + result = None + if any(results == 'Error') or any(results == 'Failures') or any( + results == 'UnexpectedSuccesses'): + result = ModelTag.MODEL_FAIL + elif any(results == 'Success'): + result = ModelTag.MODEL_PASS + elif all(results == 'Skipped'): + result = ModelTag.MODEL_SKIP + else: + print(f'invalid results for {model_name} \n{results}') + + if result is not None: + commit_model_ut_result(model_name,
result) print('Testing result summary.') print(result_msg) - if result == 'FAILED': + if final_result == 'FAILED': sys.exit(1) @@ -401,7 +420,7 @@ if __name__ == '__main__': parser.add_argument( '--suites', nargs='*', - help='Run specified test suites(test suite file list)') + help='Run specified test suites(test suite files list split by space)') args = parser.parse_args() set_test_level(args.level) os.environ['REGRESSION_BASELINE'] = '1' diff --git a/tests/run_config.yaml b/tests/run_config.yaml index f44053f6..fc983023 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -6,6 +6,9 @@ isolated: # test cases that may require excessive anmount of GPU memory, which - test_video_summarization.py - test_dialog_modeling.py - test_csanmt_translation.py + - test_image_super_resolution.py + - test_easycv_trainer.py + - test_segformer.py envs: default: # default env, case not in other env will in default, pytorch. diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py index ed8cd1fe..c0860529 100644 --- a/tests/trainers/audio/test_ans_trainer.py +++ b/tests/trainers/audio/test_ans_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import shutil import tempfile diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py index 6d1d7ec4..4bd63c55 100644 --- a/tests/trainers/easycv/test_easycv_trainer.py +++ b/tests/trainers/easycv/test_easycv_trainer.py @@ -6,10 +6,10 @@ import tempfile import unittest import json -import requests import torch from modelscope.metainfo import Models, Pipelines, Trainers +from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config from modelscope.utils.constant import LogKeys, ModeKeys, Tasks @@ -18,55 +18,19 @@ from modelscope.utils.test_utils import DistributedTestCase, test_level from modelscope.utils.torch_utils import is_master -def _download_data(url, save_dir): - r = requests.get(url, verify=True) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - zip_name = os.path.split(url)[-1] - save_path = os.path.join(save_dir, zip_name) - with open(save_path, 'wb') as f: - f.write(r.content) - - unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0]) - shutil.unpack_archive(save_path, unpack_dir) - - -def train_func(work_dir, dist=False, log_config=3, imgs_per_gpu=4): +def train_func(work_dir, dist=False, log_interval=3, imgs_per_gpu=4): import easycv config_path = os.path.join( os.path.dirname(easycv.__file__), 'configs/detection/yolox/yolox_s_8xb16_300e_coco.py') - data_dir = os.path.join(work_dir, 'small_coco_test') - url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco.zip' - if is_master(): - _download_data(url, data_dir) - - import time - time.sleep(1) cfg = Config.from_file(config_path) - cfg.work_dir = work_dir - cfg.total_epochs = 2 - cfg.checkpoint_config.interval = 1 - cfg.eval_config.interval = 1 - cfg.log_config = dict( - interval=log_config, - hooks=[ + cfg.log_config.update( + dict(hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') - ]) - cfg.data.train.data_source.ann_file = os.path.join( - data_dir, 'small_coco/small_coco/instances_train2017_20.json') - cfg.data.train.data_source.img_prefix = os.path.join( - data_dir, 'small_coco/small_coco/train2017') - cfg.data.val.data_source.ann_file = os.path.join( - data_dir, 'small_coco/small_coco/instances_val2017_20.json') - 
cfg.data.val.data_source.img_prefix = os.path.join( - data_dir, 'small_coco/small_coco/val2017') - cfg.data.imgs_per_gpu = imgs_per_gpu - cfg.data.workers_per_gpu = 2 - cfg.data.val.imgs_per_gpu = 2 + ])) # not support TensorboardLoggerHookV2 ms_cfg_file = os.path.join(work_dir, 'ms_yolox_s_8xb16_300e_coco.json') from easycv.utils.ms_utils import to_ms_config @@ -81,9 +45,41 @@ def train_func(work_dir, dist=False, log_config=3, imgs_per_gpu=4): save_path=ms_cfg_file) trainer_name = Trainers.easycv + train_dataset = MsDataset.load( + dataset_name='small_coco_for_test', namespace='EasyCV', split='train') + eval_dataset = MsDataset.load( + dataset_name='small_coco_for_test', + namespace='EasyCV', + split='validation') + + cfg_options = { + 'train.max_epochs': + 2, + 'train.dataloader.batch_size_per_gpu': + imgs_per_gpu, + 'evaluation.dataloader.batch_size_per_gpu': + 2, + 'train.hooks': [ + { + 'type': 'CheckpointHook', + 'interval': 1 + }, + { + 'type': 'EvaluationHook', + 'interval': 1 + }, + { + 'type': 'TextLoggerHook', + 'interval': log_interval + }, + ] + } kwargs = dict( - task=Tasks.image_object_detection, cfg_file=ms_cfg_file, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=work_dir, + cfg_options=cfg_options, launcher='pytorch' if dist else None) trainer = build_trainer(trainer_name, kwargs) @@ -105,11 +101,8 @@ class EasyCVTrainerTestSingleGpu(unittest.TestCase): super().tearDown() shutil.rmtree(self.tmp_dir, ignore_errors=True) - @unittest.skipIf( - True, 'The test cases are all run in the master process, ' - 'cause registry conflicts, and it should run in the subprocess.') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_single_gpu(self): - # TODO: run in subprocess train_func(self.tmp_dir) results_files = os.listdir(self.tmp_dir) @@ -185,7 +178,7 @@ class EasyCVTrainerTestMultiGpus(DistributedTestCase): num_gpus=2, work_dir=self.tmp_dir, dist=True, - log_config=2, + log_interval=2, imgs_per_gpu=5) results_files = os.listdir(self.tmp_dir) diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py index 0da47ef6..90a66635 100644 --- a/tests/trainers/easycv/test_segformer.py +++ b/tests/trainers/easycv/test_segformer.py @@ -5,28 +5,14 @@ import shutil import tempfile import unittest -import requests import torch from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, Tasks from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level -from modelscope.utils.torch_utils import is_master - - -def _download_data(url, save_dir): - r = requests.get(url, verify=True) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - zip_name = os.path.split(url)[-1] - save_path = os.path.join(save_dir, zip_name) - with open(save_path, 'wb') as f: - f.write(r.content) - - unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0]) - shutil.unpack_archive(save_path, unpack_dir) @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') @@ -45,46 +31,33 @@ class EasyCVTrainerTestSegformer(unittest.TestCase): shutil.rmtree(self.tmp_dir, ignore_errors=True) def _train(self): - from modelscope.trainers.easycv.trainer import EasyCVEpochBasedTrainer - - url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco_stuff164k.zip' - data_dir = os.path.join(self.tmp_dir, 'data') - if is_master(): - 
_download_data(url, data_dir) - - # adapt to ditributed mode - from easycv.utils.test_util import pseudo_dist_init - pseudo_dist_init() - root_path = os.path.join(data_dir, 'small_coco_stuff164k') cfg_options = { - 'train.max_epochs': - 2, - 'dataset.train.data_source.img_root': - os.path.join(root_path, 'train2017'), - 'dataset.train.data_source.label_root': - os.path.join(root_path, 'annotations/train2017'), - 'dataset.train.data_source.split': - os.path.join(root_path, 'train.txt'), - 'dataset.val.data_source.img_root': - os.path.join(root_path, 'val2017'), - 'dataset.val.data_source.label_root': - os.path.join(root_path, 'annotations/val2017'), - 'dataset.val.data_source.split': - os.path.join(root_path, 'val.txt'), + 'train.max_epochs': 2, + 'model.decode_head.norm_cfg.type': 'BN' } trainer_name = Trainers.easycv + train_dataset = MsDataset.load( + dataset_name='small_coco_stuff164k', + namespace='EasyCV', + split='train') + eval_dataset = MsDataset.load( + dataset_name='small_coco_stuff164k', + namespace='EasyCV', + split='validation') kwargs = dict( - task=Tasks.image_segmentation, - model='EasyCV/EasyCV-Segformer-b0', + model= + 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k', + train_dataset=train_dataset, + eval_dataset=eval_dataset, work_dir=self.tmp_dir, cfg_options=cfg_options) trainer = build_trainer(trainer_name, kwargs) trainer.train() - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_single_gpu_segformer(self): self._train() diff --git a/tests/trainers/test_dialog_intent_trainer.py b/tests/trainers/test_dialog_intent_trainer.py new file mode 100644 index 00000000..207387ac --- /dev/null +++ b/tests/trainers/test_dialog_intent_trainer.py @@ -0,0 +1,103 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os +import shutil +import tempfile +import unittest + +import json + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import DownloadMode, ModelFile, Tasks +from modelscope.utils.test_utils import test_level + + +class TestDialogIntentTrainer(unittest.TestCase): + + def setUp(self): + self.save_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.save_dir): + os.mkdir(self.save_dir) + + def tearDown(self): + shutil.rmtree(self.save_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_model_and_args(self): + model_id = 'damo/nlp_space_pretrained-dialog-model' + data_banking = MsDataset.load('banking77') + self.data_dir = data_banking._hf_ds.config_kwargs['split_config'][ + 'train'] + self.model_dir = snapshot_download(model_id) + self.debugging = True + kwargs = dict( + model_dir=self.model_dir, + cfg_name='intent_train_config.json', + cfg_modify_fn=self.cfg_modify_fn) + trainer = build_trainer( + name=Trainers.dialog_intent_trainer, default_args=kwargs) + trainer.train() + + def cfg_modify_fn(self, cfg): + config = { + 'num_intent': 77, + 'BPETextField': { + 'vocab_path': '', + 'data_name': 'banking77', + 'data_root': self.data_dir, + 'understand': True, + 'generation': False, + 'max_len': 256 + }, + 'Dataset': { + 'data_dir': self.data_dir, + 'with_contrastive': False, + 'trigger_role': 'user', + 'trigger_data': 'banking' + }, + 'Trainer': { + 'can_norm': True, + 'seed': 11, + 'gpu': 1, + 'save_dir': self.save_dir, + 'batch_size_label': 128, + 'batch_size_nolabel': 0, + 'log_steps': 20 + }, + 'Model': { + 'init_checkpoint': self.model_dir, + 'model': 'IntentUnifiedTransformer', + 'example': False, + 'num_intent': 77, + 'with_rdrop': True, + 'num_turn_embeddings': 21, + 'dropout': 0.25, + 'kl_ratio': 5.0, + 'embed_dropout': 0.25, + 'attn_dropout': 0.25, + 'ff_dropout': 0.25, + 'with_pool': False, + 'warmup_steps': -1 + } + } + cfg.BPETextField.vocab_path = os.path.join(self.model_dir, + ModelFile.VOCAB_FILE) + cfg.num_intent = 77 + cfg.Trainer.update(config['Trainer']) + cfg.BPETextField.update(config['BPETextField']) + cfg.Dataset.update(config['Dataset']) + cfg.Model.update(config['Model']) + if self.debugging: + cfg.Trainer.save_checkpoint = False + cfg.Trainer.num_epochs = 5 + cfg.Trainer.batch_size_label = 64 + return cfg + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_dialog_modeling_trainer.py b/tests/trainers/test_dialog_modeling_trainer.py new file mode 100644 index 00000000..be03db30 --- /dev/null +++ b/tests/trainers/test_dialog_modeling_trainer.py @@ -0,0 +1,68 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Preprocessors, Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode, ModelFile +from modelscope.utils.test_utils import test_level + + +class TestDialogModelingTrainer(unittest.TestCase): + + model_id = 'damo/nlp_space_pretrained-dialog-model' + output_dir = './dialog_fintune_result' + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer_with_model_and_args(self): + # download data set + data_multiwoz = MsDataset.load( + 'MultiWoz2.0', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) + data_dir = os.path.join( + data_multiwoz._hf_ds.config_kwargs['split_config']['train'], + 'data') + + # download model + model_dir = snapshot_download(self.model_id) + + # dialog finetune config + def cfg_modify_fn(cfg): + config = { + 'seed': 10, + 'gpu': 4, + 'use_data_distributed': False, + 'valid_metric_name': '-loss', + 'num_epochs': 60, + 'save_dir': self.output_dir, + 'token_loss': True, + 'batch_size': 32, + 'log_steps': 10, + 'valid_steps': 0, + 'save_checkpoint': True, + 'save_summary': False, + 'shuffle': True, + 'sort_pool_size': 0 + } + + cfg.Trainer = config + cfg.use_gpu = torch.cuda.is_available() and config['gpu'] >= 1 + return cfg + + # trainer config + kwargs = dict( + model_dir=model_dir, + cfg_name='gen_train_config.json', + data_dir=data_dir, + cfg_modify_fn=cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.dialog_modeling_trainer, default_args=kwargs) + trainer.train() + checkpoint_path = os.path.join(self.output_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + assert os.path.exists(checkpoint_path) + trainer.evaluate(checkpoint_path=checkpoint_path) diff --git a/tests/trainers/test_finetune_passage_ranking.py b/tests/trainers/test_finetune_passage_ranking.py new file mode 100644 index 00000000..f833f981 --- /dev/null +++ b/tests/trainers/test_finetune_passage_ranking.py @@ -0,0 +1,133 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest +from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union + +import torch +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + +from modelscope.metainfo import Trainers +from modelscope.models import Model +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile, Tasks + + +class TestFinetuneSequenceClassification(unittest.TestCase): + inputs = { + 'source_sentence': ["how long it take to get a master's degree"], + 'sentences_to_compare': [ + "On average, students take about 18 to 24 months to complete a master's degree.", + 'On the other hand, some students prefer to go at a slower pace and choose to take ' + 'several years to complete their studies.', + 'It can take anywhere from two semesters' + ] + } + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def finetune(self, + model_id, + train_dataset, + eval_dataset, + name=Trainers.nlp_passage_ranking_trainer, + cfg_modify_fn=None, + **kwargs): + kwargs = dict( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + **kwargs) + + os.environ['LOCAL_RANK'] = '0' + trainer = build_trainer(name=name, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + + def test_finetune_msmarco(self): + + def cfg_modify_fn(cfg): + cfg.task = 'passage-ranking' + cfg['preprocessor'] = {'type': 'passage-ranking'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'type': 'bert', + 'query_sequence': 'query', + 'pos_sequence': 'positive_passages', + 'neg_sequence': 'negative_passages', + 'passage_text_fileds': ['title', 'text'], + 'qid_field': 'query_id' + }, + 'val': { + 'type': 'bert', + 'query_sequence': 'query', + 'pos_sequence': 'positive_passages', + 'neg_sequence': 'negative_passages', + 'passage_text_fileds': ['title', 'text'], + 'qid_field': 'query_id' + }, + } + cfg['train']['neg_samples'] = 4 + cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30 + cfg.train.max_epochs = 1 + cfg.train.train_batch_size = 4 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 3000 + }] + return cfg + + # load dataset + ds = MsDataset.load('passage-ranking-demo', 'zyznull') + train_ds = ds['train'].to_hf_dataset() + dev_ds = ds['train'].to_hf_dataset() + + self.finetune( + model_id='damo/nlp_corom_passage-ranking_english-base', + train_dataset=train_ds, + eval_dataset=dev_ds, + cfg_modify_fn=cfg_modify_fn) + + output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) + self.pipeline_passage_ranking(output_dir) + + def pipeline_passage_ranking(self, model_dir): + model = Model.from_pretrained(model_dir) + pipeline_ins = pipeline(task=Tasks.passage_ranking, model=model) + 
print(pipeline_ins(input=self.inputs)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index 24f1a2fd..f2adfa22 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -10,11 +10,14 @@ from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline from modelscope.trainers import build_trainer from modelscope.trainers.hooks import Hook -from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer +from modelscope.trainers.nlp_trainer import (EpochBasedTrainer, + NlpEpochBasedTrainer) from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \ calculate_fisher from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.data_utils import to_device +from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.test_utils import test_level class TestFinetuneSequenceClassification(unittest.TestCase): @@ -28,11 +31,76 @@ class TestFinetuneSequenceClassification(unittest.TestCase): self.tmp_dir = tempfile.TemporaryDirectory().name if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) + self.regress_tool = MsRegressTool(baseline=False) def tearDown(self): shutil.rmtree(self.tmp_dir) super().tearDown() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer_repeatable(self): + import torch # noqa + + def cfg_modify_fn(cfg): + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14' + ], + 'first_sequence': + 'sentence', + 'label': + 'label', + } + } + cfg.train.max_epochs = 5 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + dataset = MsDataset.load('clue', subset_name='tnews') + + kwargs = dict( + model='damo/nlp_structbert_backbone_base_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + work_dir=self.tmp_dir, + seed=42, + cfg_modify_fn=cfg_modify_fn) + + os.environ['LOCAL_RANK'] = '0' + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + + with self.regress_tool.monitor_ms_train( + trainer, 'sbert-base-tnews', level='strict'): + trainer.train() + def finetune(self, model_id, train_dataset, @@ -54,7 +122,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) for i in range(self.epoch_num): - self.assertIn(f'epoch_{i+1}.pth', results_files) + self.assertIn(f'epoch_{i + 1}.pth', results_files) output_files = os.listdir( os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)) @@ -118,11 +186,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): }] return cfg - from datasets import load_dataset - from datasets import DownloadConfig - dc = DownloadConfig() - dc.local_files_only = True - dataset = 
load_dataset('clue', 'afqmc', download_config=dc) + dataset = MsDataset.load('clue', subset_name='afqmc') self.finetune( model_id='damo/nlp_structbert_backbone_base_std', train_dataset=dataset['train'], @@ -182,11 +246,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): }] return cfg - from datasets import load_dataset - from datasets import DownloadConfig - dc = DownloadConfig() - dc.local_files_only = True - dataset = load_dataset('clue', 'tnews', download_config=dc) + dataset = MsDataset.load('clue', subset_name='tnews') self.finetune( model_id='damo/nlp_structbert_backbone_base_std', diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py index a561effe..6aefa969 100644 --- a/tests/trainers/test_finetune_text_generation.py +++ b/tests/trainers/test_finetune_text_generation.py @@ -129,7 +129,7 @@ class TestFinetuneTextGeneration(unittest.TestCase): @unittest.skip def test_finetune_cnndm(self): from modelscope.msdatasets import MsDataset - dataset_dict = MsDataset.load('dureader_robust_qg') + dataset_dict = MsDataset.load('DuReader_robust-QG') train_dataset = dataset_dict['train'].to_hf_dataset() \ .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) eval_dataset = dataset_dict['validation'].to_hf_dataset() \ diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index 86909f74..c73a56a3 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -64,7 +64,7 @@ class TrainerTest(unittest.TestCase): super().tearDown() shutil.rmtree(self.tmp_dir) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_train_0(self): json_cfg = { 'task': Tasks.image_classification, @@ -139,7 +139,7 @@ class TrainerTest(unittest.TestCase): self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_train_1(self): json_cfg = { 'task': Tasks.image_classification, @@ -200,7 +200,7 @@ class TrainerTest(unittest.TestCase): self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_train_with_default_config(self): json_cfg = { 'task': Tasks.image_classification, @@ -319,7 +319,7 @@ class TrainerTest(unittest.TestCase): for i in [2, 5, 8]: self.assertIn(MetricKeys.ACCURACY, lines[i]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_train_with_iters_per_epoch(self): json_cfg = { 'task': Tasks.image_classification, @@ -441,7 +441,7 @@ class TrainerTest(unittest.TestCase): class DummyTrainerTest(unittest.TestCase): - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_dummy(self): default_args = dict(cfg_file='configs/examples/train.json') trainer = build_trainer('dummy', default_args) diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 3777772d..1f622287 100644 --- a/tests/trainers/test_trainer_gpu.py +++ 
b/tests/trainers/test_trainer_gpu.py @@ -17,7 +17,7 @@ from modelscope.metainfo import Metrics, Trainers from modelscope.metrics.builder import MetricKeys from modelscope.models.base import Model from modelscope.trainers import EpochBasedTrainer, build_trainer -from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile +from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks from modelscope.utils.test_utils import (DistributedTestCase, create_dummy_test_dataset, test_level) @@ -55,6 +55,7 @@ class DummyModel(nn.Module, Model): def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': work_dir, 'dataloader': { @@ -119,7 +120,7 @@ class TrainerTestSingleGpu(unittest.TestCase): super().tearDown() shutil.rmtree(self.tmp_dir) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_single_gpu(self): train_func(self.tmp_dir) diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 2cf1c152..6030ada9 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -11,7 +11,8 @@ from modelscope.models.nlp.sequence_classification import \ SbertForSequenceClassification from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline -from modelscope.trainers import build_trainer +from modelscope.trainers import EpochBasedTrainer, build_trainer +from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.hub import read_config from modelscope.utils.test_utils import test_level @@ -119,6 +120,90 @@ class TestTrainerWithNlp(unittest.TestCase): checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth')) self.assertTrue(Metrics.accuracy in eval_results) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer_with_configured_datasets(self): + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + cfg: Config = read_config(model_id) + cfg.train.max_epochs = 20 + cfg.train.work_dir = self.tmp_dir + cfg.dataset = { + 'train': { + 'name': 'afqmc_small', + 'split': 'train', + 'namespace': 'userxiaoming' + }, + 'val': { + 'name': 'afqmc_small', + 'split': 'train', + 'namespace': 'userxiaoming' + }, + } + cfg_file = os.path.join(self.tmp_dir, 'config.json') + cfg.dump(cfg_file) + kwargs = dict(model=model_id, cfg_file=cfg_file) + + trainer = build_trainer(default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(cfg.train.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + eval_results = trainer.evaluate( + checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth')) + self.assertTrue(Metrics.accuracy in eval_results) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_with_continue_train(self): + from modelscope.utils.regress_test_utils import MsRegressTool + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + cfg: Config = read_config(model_id) + cfg.train.max_epochs = 3 + cfg.train.work_dir = self.tmp_dir + cfg_file = os.path.join(self.tmp_dir, 'config.json') + cfg.dump(cfg_file) + dataset = MsDataset.load('clue', subset_name='afqmc', split='train') + dataset = 
dataset.to_hf_dataset().select(range(128)) + kwargs = dict( + model=model_id, + train_dataset=dataset, + eval_dataset=dataset, + cfg_file=cfg_file) + + regress_tool = MsRegressTool(baseline=True) + trainer: EpochBasedTrainer = build_trainer(default_args=kwargs) + + def lazy_stop_callback(): + from modelscope.trainers.hooks.hook import Hook, Priority + + class EarlyStopHook(Hook): + PRIORITY = Priority.VERY_LOW + + def after_iter(self, trainer): + if trainer.iter == 12: + raise MsRegressTool.EarlyStopError('Test finished.') + + if 'EarlyStopHook' not in [ + hook.__class__.__name__ for hook in trainer.hooks + ]: + trainer.register_hook(EarlyStopHook()) + + with regress_tool.monitor_ms_train( + trainer, + 'trainer_continue_train', + level='strict', + lazy_stop_callback=lazy_stop_callback): + trainer.train() + + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + + trainer = build_trainer(default_args=kwargs) + regress_tool = MsRegressTool(baseline=False) + with regress_tool.monitor_ms_train( + trainer, 'trainer_continue_train', level='strict'): + trainer.train(os.path.join(self.tmp_dir, 'iter_12.pth')) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_model_and_args(self): tmp_dir = tempfile.TemporaryDirectory().name diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index 9166292f..f1a50035 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .profiler import * # noqa F403 diff --git a/tests/utils/profiler.py b/tests/utils/profiler.py index 92708ad3..f5a522ef 100644 --- a/tests/utils/profiler.py +++ b/tests/utils/profiler.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import importlib import sys from functools import wraps diff --git a/tests/utils/test_device.py b/tests/utils/test_device.py index 4def9915..0d334fda 100644 --- a/tests/utils/test_device.py +++ b/tests/utils/test_device.py @@ -50,6 +50,12 @@ class DeviceTest(unittest.TestCase): with self.assertRaises(AssertionError): verify_device('xgu') + with self.assertRaises(AssertionError): + verify_device('') + + with self.assertRaises(AssertionError): + verify_device(None) + def test_create_device_torch(self): if torch.cuda.is_available(): target_device_type = 'cuda'