From 303ae2ff36d1cfa23abdadb59dfaa4d25b9bfb82 Mon Sep 17 00:00:00 2001 From: pangda Date: Fri, 28 Oct 2022 15:26:17 +0800 Subject: [PATCH 01/46] [to #42322933] fix bug for text logger Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10560149 --- modelscope/trainers/hooks/logger/text_logger_hook.py | 2 +- modelscope/trainers/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py index 8552ab4e..95644783 100644 --- a/modelscope/trainers/hooks/logger/text_logger_hook.py +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -61,7 +61,7 @@ class TextLoggerHook(LoggerHook): self.json_log_path = osp.join(self.out_dir, '{}.log.json'.format(trainer.timestamp)) if hasattr(trainer, 'meta') and trainer.meta is not None: - self._dump_log(trainer.meta, trainer) + self._dump_log(trainer.meta) def _get_max_memory(self, trainer): device = getattr(trainer.model, 'output_device', None) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index e1fd7522..aaf24cfa 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -183,7 +183,7 @@ class EpochBasedTrainer(BaseTrainer): preprocessor=self.eval_preprocessor, **kwargs) - self.train_data_collator, self.eval_default_collate = None, None + self.train_data_collator, self.eval_data_collator = None, None if isinstance(data_collator, Mapping): if not (ConfigKeys.train in data_collator or ConfigKeys.val in data_collator): From 84ed59d8578aa0a1b041822dc267c4289a4c1e13 Mon Sep 17 00:00:00 2001 From: "lingcai.wl" Date: Fri, 28 Oct 2022 16:10:50 +0800 Subject: [PATCH 02/46] [to #44834022] add service utils for model deploy Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10529621 --- modelscope/utils/demo_utils.py | 17 +-- modelscope/utils/regress_test_utils.py | 15 +-- modelscope/utils/service_utils.py | 179 +++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 29 deletions(-) create mode 100644 modelscope/utils/service_utils.py diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py index 363ae950..e57b3348 100644 --- a/modelscope/utils/demo_utils.py +++ b/modelscope/utils/demo_utils.py @@ -4,11 +4,11 @@ import io import cv2 import json -import numpy as np from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks, TasksIODescriptions +from modelscope.utils.service_utils import NumpyEncoder TASKS_INPUT_TEMPLATES = { # vision tasks @@ -234,21 +234,6 @@ class DemoCompatibilityCheck(object): return True -class NumpyEncoder(json.JSONEncoder): - - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() - - if isinstance(obj, np.floating): - return float(obj) - - if isinstance(obj, np.integer): - return int(obj) - - return json.JSONEncoder.default(self, obj) - - def preprocess(req): in_urls = req.get('urlPaths').get('inUrls') if len(req['inputs']) == 1: diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 8045d3e9..be983c6c 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -19,6 +19,8 @@ import torch import torch.optim from torch import nn +from modelscope.utils.service_utils import NumpyEncoder + class RegressTool: """This class is used to stop inference/training results from changing by some unaware affections by unittests. 
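# Editor's note (hedged sketch, not part of the patch): this commit removes the
# NumpyEncoder copies that demo_utils.py and regress_test_utils.py each defined
# locally and imports the shared class from the new modelscope/utils/service_utils.py.
# The snippet below only illustrates why that encoder is needed when dumping
# numpy-bearing pipeline outputs to JSON; the sample dict is hypothetical and the
# import assumes the patched modelscope package is on the path.
import json
import numpy as np
from modelscope.utils.service_utils import NumpyEncoder

outputs = {'scores': np.array([0.1, 0.9]), 'label_id': np.int64(1)}
# json.dumps(outputs) would raise a TypeError (ndarray is not JSON serializable);
# the custom encoder converts numpy arrays/scalars to plain Python types first.
print(json.dumps(outputs, cls=NumpyEncoder))  # {"scores": [0.1, 0.9], "label_id": 1}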
@@ -117,19 +119,6 @@ class RegressTool: with open(baseline, 'rb') as f: base = pickle.load(f) - class NumpyEncoder(json.JSONEncoder): - """Special json encoder for numpy types - """ - - def default(self, obj): - if isinstance(obj, np.integer): - return int(obj) - elif isinstance(obj, np.floating): - return float(obj) - elif isinstance(obj, np.ndarray): - return obj.tolist() - return json.JSONEncoder.default(self, obj) - print(f'baseline: {json.dumps(base, cls=NumpyEncoder)}') print(f'latest : {json.dumps(io_json, cls=NumpyEncoder)}') if not compare_io_and_print(base, io_json, compare_fn, **kwargs): diff --git a/modelscope/utils/service_utils.py b/modelscope/utils/service_utils.py new file mode 100644 index 00000000..29c111f8 --- /dev/null +++ b/modelscope/utils/service_utils.py @@ -0,0 +1,179 @@ +import base64 +import mimetypes +from io import BytesIO + +import json +import numpy as np +import requests +from PIL import Image + +from modelscope.outputs import TASK_OUTPUTS, OutputKeys +from modelscope.pipeline_inputs import TASK_INPUTS, InputType +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks, TasksIODescriptions + + +# service data decoder func decodes data from network and convert it to pipeline's input +# for example +def ExampleDecoder(data): + # Assuming the pipeline inputs is a dict contains an image and a text, + # to decode the data from network we decode the image as base64 + data_json = json.loads(data) + # data: {"image": "xxxxxxxx=="(base64 str), "text": "a question"} + # pipeline(inputs) as follows: + # pipeline({'image': image, 'text': text}) + inputs = { + 'image': decode_base64_to_image(data_json.get('image')), + 'text': data_json.get('text') + } + return inputs + + +# service data encoder func encodes data from pipeline outputs and convert to network response (such as json) +# for example +def ExampleEncoder(data): + # Assuming the pipeline outputs is a dict contains an image and a text, + # and transmit it through network, this func encode image to base64 and dumps into json + # data (for e.g. 
python dict): + # {"image": a numpy array represents a image, "text": "output"} + image = data['image'] + text = data['text'] + data = {'image': encode_array_to_img_base64(image), 'text': text} + return json.dumps(data, cls=NumpyEncoder) + + +CustomEncoder = { + # Tasks.visual_question_answering: ExampleEncoder +} + +CustomDecoder = { + # Tasks.visual_question_answering: ExampleDecoder +} + + +class NumpyEncoder(json.JSONEncoder): + + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + + if isinstance(obj, np.floating): + return float(obj) + + if isinstance(obj, np.integer): + return int(obj) + + return json.JSONEncoder.default(self, obj) + + +def get_extension(encoding): + encoding = encoding.replace('audio/wav', 'audio/x-wav') + tp = mimetypes.guess_type(encoding)[0] + if tp == 'audio/flac': # flac is not supported by mimetypes + return 'flac' + extension = mimetypes.guess_extension(tp) + if extension is not None and extension.startswith('.'): + extension = extension[1:] + return extension + + +def get_mimetype(filename): + mimetype = mimetypes.guess_type(filename)[0] + if mimetype is not None: + mimetype = mimetype.replace('x-wav', 'wav').replace('x-flac', 'flac') + return mimetype + + +def decode_base64_to_binary(encoding): + extension = get_extension(encoding) + data = encoding.split(',')[1] + return base64.b64decode(data), extension + + +def decode_base64_to_image(encoding): + content = encoding.split(';')[1] + image_encoded = content.split(',')[1] + return Image.open(BytesIO(base64.b64decode(image_encoded))) + + +def encode_array_to_img_base64(image_array): + with BytesIO() as output_bytes: + pil_image = Image.fromarray(image_array.astype(np.uint8)) + pil_image.save(output_bytes, 'PNG') + bytes_data = output_bytes.getvalue() + base64_str = str(base64.b64encode(bytes_data), 'utf-8') + return 'data:image/png;base64,' + base64_str + + +def encode_pcm_to_base64(bytes_data): + from scipy.io.wavfile import write + with BytesIO() as out_mem_file: + write(out_mem_file, 16000, bytes_data) + base64_str = str(base64.b64encode(out_mem_file.getvalue()), 'utf-8') + return 'data:audio/pcm;base64,' + base64_str + + +def encode_url_to_base64(url): + encoded_string = base64.b64encode(requests.get(url).content) + base64_str = str(encoded_string, 'utf-8') + mimetype = get_mimetype(url) + return ('data:' + (mimetype if mimetype is not None else '') + ';base64,' + + base64_str) + + +def encode_file_to_base64(f): + with open(f, 'rb') as file: + encoded_string = base64.b64encode(file.read()) + base64_str = str(encoded_string, 'utf-8') + mimetype = get_mimetype(f) + return ('data:' + (mimetype if mimetype is not None else '') + + ';base64,' + base64_str) + + +def encode_url_or_file_to_base64(path): + try: + requests.get(path) + return encode_url_to_base64(path) + except (requests.exceptions.MissingSchema, + requests.exceptions.InvalidSchema): + return encode_file_to_base64(path) + + +def service_data_decoder(task, data): + if CustomDecoder.get(task) is not None: + return CustomDecoder[task](data) + input_type = TASK_INPUTS[task] + input_data = data.decode('utf-8') + if input_type == InputType.IMAGE: + return decode_base64_to_image(input_data) + elif input_type == InputType.AUDIO: + return decode_base64_to_binary(input_data)[0] + elif input_type == InputType.TEXT: + return input_data + elif isinstance(input_type, dict): + input_data = {} + for key, val in input_type.items(): + if val == InputType.IMAGE: + input_data[key] = decode_base64_to_image(data[key]) + elif val == 
InputType.AUDIO: + input_data[key] = decode_base64_to_binary(data[key])[0] + elif val == InputType.TEXT: + input_data[key] = data[key] + + return input_data + + +def service_data_encoder(task, data): + if CustomEncoder.get(task) is not None: + return CustomEncoder[task](data) + output_keys = TASK_OUTPUTS[task] + result = data + for output_key in output_keys: + if output_key == OutputKeys.OUTPUT_IMG: + result[OutputKeys.OUTPUT_IMG] = encode_array_to_img_base64( + data[OutputKeys.OUTPUT_IMG][..., ::-1]) + elif output_key == OutputKeys.OUTPUT_PCM: + result[OutputKeys.OUTPUT_PCM] = encode_pcm_to_base64( + data[OutputKeys.OUTPUT_PCM]) + result = bytes(json.dumps(result, cls=NumpyEncoder), encoding='utf8') + return result From 261c04b8b59527e3b10ae7bb8b37ea42a7d6510b Mon Sep 17 00:00:00 2001 From: Yufeng <47727949+shuaigezhu@users.noreply.github.com> Date: Fri, 28 Oct 2022 17:09:27 +0800 Subject: [PATCH 03/46] add Mglm (#5) * mglm init * add mglm requirements Co-authored-by: Yufeng Co-authored-by: wenmeng.zwm --- modelscope/metainfo.py | 3 + modelscope/models/nlp/__init__.py | 2 + modelscope/models/nlp/mglm/__init__.py | 22 + modelscope/models/nlp/mglm/arguments.py | 793 +++++++++ modelscope/models/nlp/mglm/blocklm_utils.py | 625 +++++++ modelscope/models/nlp/mglm/configure_data.py | 513 ++++++ .../models/nlp/mglm/data_utils/__init__.py | 341 ++++ .../models/nlp/mglm/data_utils/corpora.py | 583 ++++++ .../models/nlp/mglm/data_utils/datasets.py | 1244 +++++++++++++ .../models/nlp/mglm/data_utils/extraction.py | 71 + .../models/nlp/mglm/data_utils/file_utils.py | 256 +++ .../models/nlp/mglm/data_utils/lazy_loader.py | 286 +++ .../models/nlp/mglm/data_utils/samplers.py | 190 ++ .../nlp/mglm/data_utils/sp_tokenizer.py | 158 ++ .../nlp/mglm/data_utils/tokenization.py | 1396 +++++++++++++++ .../nlp/mglm/data_utils/tokenization_gpt2.py | 359 ++++ .../models/nlp/mglm/data_utils/wordpiece.py | 408 +++++ modelscope/models/nlp/mglm/fp16/__init__.py | 20 + modelscope/models/nlp/mglm/fp16/fp16.py | 660 +++++++ modelscope/models/nlp/mglm/fp16/fp16util.py | 220 +++ .../models/nlp/mglm/fp16/loss_scaler.py | 245 +++ .../models/nlp/mglm/generation_utils.py | 483 +++++ .../nlp/mglm/mglm_for_text_summarization.py | 469 +++++ modelscope/models/nlp/mglm/model/__init__.py | 20 + .../models/nlp/mglm/model/distributed.py | 127 ++ .../models/nlp/mglm/model/downstream.py | 242 +++ .../models/nlp/mglm/model/modeling_bert.py | 1576 +++++++++++++++++ .../models/nlp/mglm/model/modeling_glm.py | 245 +++ modelscope/models/nlp/mglm/model/prompt.py | 59 + modelscope/models/nlp/mglm/mpu/__init__.py | 37 + .../models/nlp/mglm/mpu/cross_entropy.py | 110 ++ modelscope/models/nlp/mglm/mpu/data.py | 117 ++ modelscope/models/nlp/mglm/mpu/grads.py | 72 + modelscope/models/nlp/mglm/mpu/initialize.py | 130 ++ modelscope/models/nlp/mglm/mpu/layers.py | 357 ++++ modelscope/models/nlp/mglm/mpu/mappings.py | 144 ++ modelscope/models/nlp/mglm/mpu/random.py | 408 +++++ .../models/nlp/mglm/mpu/tests/__init__.py | 0 .../models/nlp/mglm/mpu/tests/commons.py | 86 + .../nlp/mglm/mpu/tests/test_cross_entropy.py | 106 ++ .../models/nlp/mglm/mpu/tests/test_data.py | 91 + .../nlp/mglm/mpu/tests/test_initialize.py | 95 + .../models/nlp/mglm/mpu/tests/test_layers.py | 533 ++++++ .../models/nlp/mglm/mpu/tests/test_random.py | 206 +++ modelscope/models/nlp/mglm/mpu/transformer.py | 1200 +++++++++++++ modelscope/models/nlp/mglm/mpu/utils.py | 70 + modelscope/models/nlp/mglm/process_grid.py | 61 + modelscope/models/nlp/mglm/requirements.txt | 22 + 
modelscope/models/nlp/mglm/run_test.py | 10 + .../models/nlp/mglm/tasks/data_utils.py | 389 ++++ .../models/nlp/mglm/tasks/eval_utils.py | 249 +++ .../nlp/mglm/tasks/language_model/dataset.py | 249 +++ .../mglm/tasks/language_model/detokenizer.py | 63 + .../nlp/mglm/tasks/language_model/finetune.py | 254 +++ .../models/nlp/mglm/tasks/seq2seq/dataset.py | 667 +++++++ .../models/nlp/mglm/tasks/seq2seq/evaluate.py | 538 ++++++ .../models/nlp/mglm/tasks/seq2seq/finetune.py | 151 ++ .../models/nlp/mglm/tasks/superglue/README.md | 137 ++ .../nlp/mglm/tasks/superglue/__init__.py | 0 .../nlp/mglm/tasks/superglue/dataset.py | 1475 +++++++++++++++ .../nlp/mglm/tasks/superglue/evaluate.py | 101 ++ .../nlp/mglm/tasks/superglue/finetune.py | 138 ++ .../models/nlp/mglm/tasks/superglue/pvp.py | 1541 ++++++++++++++++ modelscope/models/nlp/mglm/test/__init__.py | 0 modelscope/models/nlp/mglm/test/test_block.py | 36 + .../models/nlp/mglm/test/test_rel_shift.py | 27 + modelscope/models/nlp/mglm/train_utils.py | 472 +++++ modelscope/models/nlp/mglm/utils.py | 529 ++++++ modelscope/outputs/outputs.py | 6 + modelscope/pipelines/nlp/__init__.py | 2 + .../nlp/mglm_text_summarization_pipeline.py | 43 + modelscope/preprocessors/__init__.py | 19 +- modelscope/preprocessors/nlp/__init__.py | 2 + .../nlp/mglm_summarization_preprocessor.py | 32 + requirements/nlp.txt | 15 +- .../pipelines/test_mglm_text_summarization.py | 47 + 76 files changed, 22640 insertions(+), 13 deletions(-) create mode 100644 modelscope/models/nlp/mglm/__init__.py create mode 100755 modelscope/models/nlp/mglm/arguments.py create mode 100644 modelscope/models/nlp/mglm/blocklm_utils.py create mode 100644 modelscope/models/nlp/mglm/configure_data.py create mode 100644 modelscope/models/nlp/mglm/data_utils/__init__.py create mode 100755 modelscope/models/nlp/mglm/data_utils/corpora.py create mode 100644 modelscope/models/nlp/mglm/data_utils/datasets.py create mode 100644 modelscope/models/nlp/mglm/data_utils/extraction.py create mode 100755 modelscope/models/nlp/mglm/data_utils/file_utils.py create mode 100644 modelscope/models/nlp/mglm/data_utils/lazy_loader.py create mode 100644 modelscope/models/nlp/mglm/data_utils/samplers.py create mode 100644 modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py create mode 100755 modelscope/models/nlp/mglm/data_utils/tokenization.py create mode 100644 modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py create mode 100755 modelscope/models/nlp/mglm/data_utils/wordpiece.py create mode 100644 modelscope/models/nlp/mglm/fp16/__init__.py create mode 100755 modelscope/models/nlp/mglm/fp16/fp16.py create mode 100644 modelscope/models/nlp/mglm/fp16/fp16util.py create mode 100755 modelscope/models/nlp/mglm/fp16/loss_scaler.py create mode 100644 modelscope/models/nlp/mglm/generation_utils.py create mode 100644 modelscope/models/nlp/mglm/mglm_for_text_summarization.py create mode 100755 modelscope/models/nlp/mglm/model/__init__.py create mode 100755 modelscope/models/nlp/mglm/model/distributed.py create mode 100644 modelscope/models/nlp/mglm/model/downstream.py create mode 100644 modelscope/models/nlp/mglm/model/modeling_bert.py create mode 100644 modelscope/models/nlp/mglm/model/modeling_glm.py create mode 100644 modelscope/models/nlp/mglm/model/prompt.py create mode 100755 modelscope/models/nlp/mglm/mpu/__init__.py create mode 100644 modelscope/models/nlp/mglm/mpu/cross_entropy.py create mode 100644 modelscope/models/nlp/mglm/mpu/data.py create mode 100644 modelscope/models/nlp/mglm/mpu/grads.py create mode 
100644 modelscope/models/nlp/mglm/mpu/initialize.py create mode 100644 modelscope/models/nlp/mglm/mpu/layers.py create mode 100644 modelscope/models/nlp/mglm/mpu/mappings.py create mode 100755 modelscope/models/nlp/mglm/mpu/random.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/__init__.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/commons.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_data.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_initialize.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_layers.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_random.py create mode 100755 modelscope/models/nlp/mglm/mpu/transformer.py create mode 100644 modelscope/models/nlp/mglm/mpu/utils.py create mode 100644 modelscope/models/nlp/mglm/process_grid.py create mode 100644 modelscope/models/nlp/mglm/requirements.txt create mode 100644 modelscope/models/nlp/mglm/run_test.py create mode 100644 modelscope/models/nlp/mglm/tasks/data_utils.py create mode 100644 modelscope/models/nlp/mglm/tasks/eval_utils.py create mode 100644 modelscope/models/nlp/mglm/tasks/language_model/dataset.py create mode 100755 modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py create mode 100644 modelscope/models/nlp/mglm/tasks/language_model/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/README.md create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/__init__.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/dataset.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/evaluate.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/pvp.py create mode 100644 modelscope/models/nlp/mglm/test/__init__.py create mode 100644 modelscope/models/nlp/mglm/test/test_block.py create mode 100644 modelscope/models/nlp/mglm/test/test_rel_shift.py create mode 100644 modelscope/models/nlp/mglm/train_utils.py create mode 100644 modelscope/models/nlp/mglm/utils.py create mode 100644 modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py create mode 100644 modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py create mode 100644 tests/pipelines/test_mglm_text_summarization.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index a671ded5..3951541c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -82,6 +82,7 @@ class Models(object): bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' T5 = 'T5' + mglm = 'mglm' bloom = 'bloom' # audio models @@ -251,6 +252,7 @@ class Pipelines(object): relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + mglm_text_summarization = 'mglm-text-summarization' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -376,6 +378,7 @@ class Preprocessors(object): re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + mglm_summarization = 
'mglm-summarization' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index ccb2d382..1d71469a 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: SbertTokenizerFast, ) from .T5 import T5ForConditionalGeneration + from .mglm import MGLMForTextSummarization from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -106,6 +107,7 @@ else: ], 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], + 'mglm': ['MGLMForTextSummarization'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/mglm/__init__.py b/modelscope/models/nlp/mglm/__init__.py new file mode 100644 index 00000000..26d1101b --- /dev/null +++ b/modelscope/models/nlp/mglm/__init__.py @@ -0,0 +1,22 @@ +# Modified by Zhipu.AI +# Original Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .mglm_for_text_summarization import mGlmForSummarization +else: + _import_structure = { + 'mglm_for_text_summarization': ['MGLMForTextSummarization'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/mglm/arguments.py b/modelscope/models/nlp/mglm/arguments.py new file mode 100755 index 00000000..13b3aeab --- /dev/null +++ b/modelscope/models/nlp/mglm/arguments.py @@ -0,0 +1,793 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""argparser configuration""" + +import argparse +import os + +import deepspeed +import json +import torch + +from .utils import get_hostname + + +def add_model_config_args(parser): + """Model arguments""" + + group = parser.add_argument_group('model', 'model configuration') + + group.add_argument( + '--transformer-xl', + action='store_true', + help='use transformer-xl for training') + group.add_argument( + '--pretrained-bert', + action='store_true', + help='use a pretrained bert-large-uncased model instead' + 'of initializing from scratch. 
See ' + '--tokenizer-model-type to specify which pretrained ' + 'BERT model to use') + group.add_argument( + '--encoder-decoder', + action='store_true', + help='use the encoder-decoder architecture for blocklm') + group.add_argument( + '--attention-dropout', + type=float, + default=0.1, + help='dropout probability for attention weights') + group.add_argument( + '--num-attention-heads', + type=int, + default=16, + help='num of transformer attention heads') + group.add_argument( + '--hidden-size', type=int, default=1024, help='tansformer hidden size') + group.add_argument( + '--intermediate-size', + type=int, + default=None, + help='transformer embedding dimension for FFN' + 'set to 4*`--hidden-size` if it is None') + group.add_argument( + '--num-layers', type=int, default=24, help='num decoder layers') + group.add_argument( + '--layernorm-epsilon', + type=float, + default=1e-5, + help='layer norm epsilon') + group.add_argument( + '--hidden-dropout', + type=float, + default=0.1, + help='dropout probability for hidden state transformer') + group.add_argument( + '--output-dropout', + type=float, + default=0.1, + help='dropout probability for pooled output') + group.add_argument( + '--max-position-embeddings', + type=int, + default=512, + help='maximum number of position embeddings to use') + group.add_argument( + '--vocab-size', + type=int, + default=250112, + help='vocab size to use for non-character-level ' + 'tokenization. This value will only be used when ' + 'creating a tokenizer') + group.add_argument( + '--deep-init', + action='store_true', + help='initialize bert model similar to gpt2 model.' + 'scales initialization of projection layers by a ' + 'factor of 1/sqrt(2N). Necessary to train bert ' + 'models larger than BERT-Large.') + group.add_argument( + '--make-vocab-size-divisible-by', + type=int, + default=128, + help='Pad the vocab size to be divisible by this value.' + 'This is added for computational efficieny reasons.') + group.add_argument( + '--cpu-optimizer', action='store_true', help='Run optimizer on CPU') + group.add_argument( + '--cpu_torch_adam', + action='store_true', + help='Use Torch Adam as optimizer on CPU.') + + return parser + + +def add_fp16_config_args(parser): + """Mixed precision arguments.""" + + group = parser.add_argument_group('fp16', 'fp16 configurations') + + group.add_argument( + '--fp16', action='store_true', help='Run model in fp16 mode') + group.add_argument( + '--fp32-embedding', action='store_true', help='embedding in fp32') + group.add_argument( + '--fp32-layernorm', action='store_true', help='layer norm in fp32') + group.add_argument( + '--fp32-tokentypes', + action='store_true', + help='embedding token types in fp32') + group.add_argument( + '--fp32-allreduce', action='store_true', help='all-reduce in fp32') + group.add_argument( + '--hysteresis', + type=int, + default=2, + help='hysteresis for dynamic loss scaling') + group.add_argument( + '--loss-scale', + type=float, + default=None, + help='Static loss scaling, positive power of 2 ' + 'values can improve fp16 convergence. 
If None, dynamic' + 'loss scaling is used.') + group.add_argument( + '--loss-scale-window', + type=float, + default=1000, + help='Window over which to raise/lower dynamic scale') + group.add_argument( + '--min-scale', + type=float, + default=1, + help='Minimum loss scale for dynamic loss scale') + group.add_argument('--attention-scale', type=float, default=1.0) + return parser + + +def add_training_args(parser): + """Training arguments.""" + + group = parser.add_argument_group('train', 'training configurations') + + group.add_argument( + '--experiment-name', + type=str, + default='gpt-345M', + help='The experiment name for summary and checkpoint') + group.add_argument( + '--batch-size', type=int, default=4, help='Data Loader batch size') + group.add_argument( + '--gradient-accumulation-steps', + type=int, + default=1, + help='Data Loader batch size') + group.add_argument( + '--weight-decay', + type=float, + default=0.01, + help='weight decay coefficient for L2 regularization') + group.add_argument( + '--checkpoint-activations', + action='store_true', + help='checkpoint activation to allow for training ' + 'with larger models and sequences') + group.add_argument( + '--checkpoint-num-layers', + type=int, + default=1, + help='chunk size (number of layers) for checkpointing') + group.add_argument( + '--deepspeed-activation-checkpointing', + action='store_true', + help='uses activation checkpointing from deepspeed') + group.add_argument( + '--epochs', + type=int, + default=None, + help='Number of finetunning epochs. Zero results in evaluation only.') + group.add_argument( + '--clip-grad', type=float, default=1.0, help='gradient clipping') + group.add_argument( + '--train-iters', + type=int, + default=0, + help='total number of iterations to train over all training runs') + group.add_argument('--label-smoothing', type=float, default=0.0) + group.add_argument( + '--log-interval', type=int, default=100, help='report interval') + group.add_argument( + '--summary-dir', + type=str, + default='', + help='The directory to store the summary') + group.add_argument('--seed', type=int, default=1234, help='random seed') + # Batch producer arguments + group.add_argument( + '--reset-position-ids', + action='store_true', + help='Reset posistion ids after end-of-document token.') + group.add_argument( + '--reset-attention-mask', + action='store_true', + help='Reset self attention maske after ' + 'end-of-document token.') + + # Learning rate. + group.add_argument( + '--lr-decay-iters', + type=int, + default=None, + help='number of iterations to decay LR over,' + ' If None defaults to `--train-iters`*`--epochs`') + group.add_argument( + '--lr-decay-style', + type=str, + default='linear', + choices=['constant', 'linear', 'cosine', 'exponential'], + help='learning rate decay function') + group.add_argument('--lr-decay-ratio', type=float, default=0.1) + group.add_argument( + '--lr', type=float, default=1.0e-4, help='initial learning rate') + group.add_argument( + '--warmup', + type=float, + default=0.01, + help='percentage of data to warmup on (.01 = 1% of all ' + 'training iters). 
Default 0.01') + group.add_argument( + '--switch-linear', + action='store_true', + help='Switch to linear decay for cosine decay') + # model checkpointing + group.add_argument( + '--save', + type=str, + default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--new-save-directory', action='store_true') + group.add_argument( + '--save-epoch', + type=int, + default=1, + help='number of epochs between saves') + group.add_argument( + '--save-interval', + type=int, + default=5000, + help='number of iterations between saves') + group.add_argument( + '--no-save-optim', + action='store_true', + help='Do not save current optimizer.') + group.add_argument( + '--no-save-rng', + action='store_true', + help='Do not save current rng state.') + group.add_argument( + '--load', + type=str, + default=None, + help='Path to a directory containing a model checkpoint.') + group.add_argument( + '--no-load-optim', + action='store_true', + help='Do not load optimizer when loading checkpoint.') + group.add_argument( + '--no-load-rng', + action='store_true', + help='Do not load rng state when loading checkpoint.') + group.add_argument( + '--no-load-lr-scheduler', + action='store_true', + help='Do not load lr scheduler when loading checkpoint.') + group.add_argument( + '--no-deepspeed-load', + action='store_true', + help='Not use deepspeed when loading checkpoint') + group.add_argument( + '--finetune', + action='store_true', + help='Load model for finetuning. Do not load optimizer ' + 'or rng state from checkpoint and set iteration to 0. ' + 'Assumed when loading a release checkpoint.') + group.add_argument( + '--resume-dataloader', + action='store_true', + help='Resume the dataloader when resuming training. ' + 'Does not apply to tfrecords dataloader, try resuming' + 'with a different seed in this case.') + # distributed training args + group.add_argument( + '--distributed-backend', + default='nccl', + help= + 'which backend to use for distributed training. 
One of [gloo, nccl]', + choices=['nccl', 'gloo']) + group.add_argument( + '--DDP-impl', + default='torch', + choices=['local', 'torch', 'none'], + help='which DistributedDataParallel implementation to use.') + + group.add_argument( + '--local_rank', + type=int, + default=None, + help='local rank passed from distributed launcher') + # BlockLM training args + group.add_argument( + '--block-lm', + action='store_true', + help='whether use the BlockLM pre-training') + group.add_argument( + '--masked-lm', + action='store_true', + help='whether to use the mlm objective') + group.add_argument('--bert-prob', type=float, default=0.5) + group.add_argument('--gpt-infill-prob', type=float, default=0.5) + group.add_argument('--gpt-min-ratio', type=float, default=0.5) + group.add_argument('--gap-sentence-prob', type=float, default=0.0) + group.add_argument('--gap-sentence-ratio', type=float, default=0.15) + group.add_argument('--avg-block-length', type=int, default=3) + group.add_argument('--short-seq-prob', type=float, default=0.0) + group.add_argument('--single-span-prob', type=float, default=0.0) + group.add_argument( + '--task-mask', + action='store_true', + help='Use different mask for generation and blank filling') + group.add_argument( + '--no-shuffle-block', + action='store_true', + help='not shuffle the blocks when filling the blank') + group.add_argument( + '--no-block-position', + action='store_true', + help='Use (rough) absolute positions instead of block positions') + group.add_argument( + '--sentinel-token', + action='store_true', + help='Use sentinel (mask) tokens to replace 2d position encoding') + group.add_argument('--block-mask-prob', type=float, default=0.0) + group.add_argument('--context-mask-ratio', type=float, default=0.0) + group.add_argument( + '--random-position', + action='store_true', + help='Use random start position to cover all the position embeddings') + return parser + + +def add_evaluation_args(parser): + """Evaluation arguments.""" + + group = parser.add_argument_group('validation', + 'validation configurations') + + group.add_argument( + '--eval-batch-size', + type=int, + default=None, + help='Data Loader batch size for evaluation datasets.' + 'Defaults to `--batch-size`') + group.add_argument( + '--eval-iters', + type=int, + default=100, + help='number of iterations to run for evaluation' + 'validation/test for') + group.add_argument( + '--eval-interval', + type=int, + default=1000, + help='interval between running evaluation on validation set') + group.add_argument( + '--eval-epoch', + type=int, + default=1, + help='epoch between running evaluation on validation set') + group.add_argument( + '--eval-seq-length', + type=int, + default=None, + help='Maximum sequence length to process for ' + 'evaluation. Defaults to `--seq-length`') + group.add_argument( + '--eval-max-preds-per-seq', + type=int, + default=None, + help='Maximum number of predictions to use for ' + 'evaluation. 
Defaults to ' + 'math.ceil(`--eval-seq-length`*.15/10)*10') + group.add_argument('--overlapping-eval', type=int, default=32) + + return parser + + +def add_text_generate_args(parser): + """Text generate arguments.""" + + group = parser.add_argument_group('Text generation', 'configurations') + group.add_argument('--temperature', type=float, default=1.0) + group.add_argument('--top_p', type=float, default=0.0) + group.add_argument('--top_k', type=int, default=0) + group.add_argument('--out-seq-length', type=int, default=256) + group.add_argument('--num-beams', type=int, default=1) + group.add_argument('--length-penalty', type=float, default=0.0) + group.add_argument('--no-repeat-ngram-size', type=int, default=0) + group.add_argument('--min-tgt-length', type=int, default=0) + group.add_argument('--select-topk', action='store_true') + group.add_argument('--blank-maskratio', type=float, default=0.1) + return parser + + +def add_data_args(parser): + """Train/valid/test data arguments.""" + + group = parser.add_argument_group('data', 'data configurations') + + group.add_argument( + '--model-parallel-size', + type=int, + default=1, + help='size of the model parallel.') + group.add_argument( + '--shuffle', + action='store_true', + help='Shuffle data. Shuffling is deterministic ' + 'based on seed and current epoch.') + group.add_argument('--filter-english', action='store_true') + group.add_argument( + '--train-data', + nargs='+', + default=None, + help='Whitespace separated filenames or corpora names ' + 'for training.') + group.add_argument( + '--valid-data', + nargs='*', + default=None, + help="""Filename for validation data.""") + group.add_argument( + '--test-data', + nargs='*', + default=None, + help="""Filename for testing""") + group.add_argument( + '--data-dir', + type=str, + default=None, + help='The data path to all the data files') + group.add_argument( + '--input-data-sizes-file', + type=str, + default='sizes.txt', + help='the filename containing all the shards sizes') + + group.add_argument( + '--delim', default=',', help='delimiter used to parse csv data files') + group.add_argument( + '--text-key', + default='sentence', + help='key to use to extract text from json/csv') + group.add_argument( + '--eval-text-key', + default=None, + help='key to use to extract text from ' + 'json/csv evaluation datasets') + group.add_argument( + '--split', + default='1000,1,1', + help='comma-separated list of proportions for training,' + ' validation, and test split') + + group.add_argument( + '--no-lazy-loader', + action='store_true', + help='whether to lazy read the data set') + group.add_argument('--half-lazy-loader', action='store_true') + group.add_argument( + '--loader-scatter', + type=int, + default=None, + help='Number of scatters to use for dataloaders') + group.add_argument( + '--loose-json', + action='store_true', + help='Use loose json (one json-formatted string per ' + 'newline), instead of tight json (data file is one ' + 'json string)') + group.add_argument( + '--presplit-sentences', + action='store_true', + help='Dataset content consists of documents where ' + 'each document consists of newline separated sentences') + group.add_argument( + '--num-workers', + type=int, + default=2, + help="""Number of workers to use for dataloading""") + group.add_argument( + '--tokenizer-model-type', + type=str, + default=None, + help="Model type to use for sentencepiece tokenization \ + (one of ['bpe', 'char', 'unigram', 'word']) or \ + bert vocab to use for BertWordPieceTokenizer (one of \ + 
['bert-large-uncased', 'bert-large-cased', etc.])") + group.add_argument( + '--tokenizer-path', + type=str, + default='tokenizer.model', + help='path used to save/load sentencepiece tokenization ' + 'models') + group.add_argument( + '--tokenizer-type', + type=str, + default='BertWordPieceTokenizer', + choices=[ + 'CharacterLevelTokenizer', 'SentencePieceTokenizer', + 'BertWordPieceTokenizer', 'GPT2BPETokenizer', 'ChineseSPTokenizer' + ], + help='what type of tokenizer to use') + group.add_argument('--no-pre-tokenize', action='store_true') + group.add_argument( + '--cache-dir', + default=None, + type=str, + help='Where to store pre-trained BERT downloads') + group.add_argument( + '--use-tfrecords', + action='store_true', + help='load `--train-data`, `--valid-data`, ' + '`--test-data` from BERT tf records instead of ' + 'normal data pipeline') + group.add_argument( + '--seq-length', + type=int, + default=512, + help='Maximum sequence length to process') + group.add_argument( + '--mem-length', + type=int, + default=0, + help='The memory length to preserve') + group.add_argument( + '--max-preds-per-seq', + type=int, + default=None, + help='Maximum number of predictions to use per sequence.' + 'Defaults to math.ceil(`--seq-length`*.15/10)*10.' + 'MUST BE SPECIFIED IF `--use-tfrecords` is True.') + group.add_argument('--non-sentence-start', type=float, default=0.0) + group.add_argument( + '--sample-one-document', + action='store_true', + help='only sample one document in one sample') + group.add_argument( + '--load-splits', + type=str, + default=None, + help='The path to load split indices from') + group.add_argument( + '--save-splits', + type=str, + default=None, + help='The path to save split indices to') + group.add_argument( + '--save-test-data', + type=str, + default=None, + help='The path to save the test data') + group.add_argument( + '--multi-task-data', + nargs='*', + default=None, + help='Downsteam task names for multi-task pre-training') + group.add_argument( + '--multi-task-ratio', + type=float, + default=0.0, + help='Ratio for multi-task pre-training') + group.add_argument('--multi-seq-length', type=int, default=None) + group.add_argument('--multi-batch-size', type=int, default=None) + return parser + + +def add_finetune_config_args(parser): + group = parser.add_argument_group('finetune', 'finetune configurations') + group.add_argument('--task', type=str, help='Task name.') + group.add_argument( + '--load-pretrained', + type=str, + help='Load pretrained model', + default=None) + group.add_argument( + '--pool-token', + type=str, + choices=['start', 'pad', 'cls'], + help='The token to pool the sequence representation', + default='cls') + group.add_argument( + '--cloze-eval', + action='store_true', + help='Evaluation dataset with cloze task') + group.add_argument( + '--multi-token', + action='store_true', + help='Use multi token for cloze evaluation') + group.add_argument( + '--segment-length', + type=int, + default=0, + help='The maximum segment length for cloze evaluation') + group.add_argument( + '--loss-func', + type=str, + choices=['cross_entropy', 'hinge', 'generative', 'mix'], + default='cross_entropy') + group.add_argument('--block-lm-ratio', type=float, default=0.0) + group.add_argument( + '--adapet', + action='store_true', + help='Use the decoupled cross entropy loss in AdaPET') + group.add_argument('--pattern-id', type=int, default=0) + group.add_argument( + '--fast-decode', + action='store_true', + help= + 'Fast decode for multi-token cloze. 
Can only be used without checkpoint activation.' + ) + group.add_argument('--few-superglue', action='store_true') + group.add_argument( + '--eval-valid', + action='store_true', + help='Whether evaluate on the valid set') + group.add_argument('--validation-metric', type=str, default=None) + group.add_argument( + '--unidirectional', + action='store_true', + help='Use the left to right language model') + group.add_argument('--src-seq-length', type=int, default=None) + group.add_argument('--tgt-seq-length', type=int, default=None) + group.add_argument('--adam-beta1', type=float, default=0.9) + group.add_argument('--adam-beta2', type=float, default=0.999) + group.add_argument('--adam-eps', type=float, default=1e-8) + group.add_argument( + '--optimizer', type=str, choices=['adam', 'adafactor'], default='adam') + group.add_argument('--wsc-negative', action='store_true') + group.add_argument('--overwrite', action='store_true') + group.add_argument('--no-validation', action='store_true') + # Continuous prompt arguments + group.add_argument( + '--continuous-prompt', + action='store_true', + help='Use continuous prompt for PET') + group.add_argument('--num-prompt-tokens', type=int, default=0) + group.add_argument( + '--prompt-func', default='lstm', choices=['lstm', 'mlp', 'none']) + group.add_argument( + '--freeze-transformer', action='store_true', default=False) + group.add_argument('--tune-prefix-layers', type=int, default=None) + group.add_argument('--prefix-prompt', type=int, default=0) + group.add_argument('--prompt-init', action='store_true', default=False) + return parser + + +def get_args(): + """Parse all the args.""" + + parser = argparse.ArgumentParser(description='PyTorch BERT Model') + parser = add_model_config_args(parser) + parser = add_fp16_config_args(parser) + parser = add_training_args(parser) + parser = add_evaluation_args(parser) + parser = add_text_generate_args(parser) + parser = add_data_args(parser) + parser = add_finetune_config_args(parser) + + # Include DeepSpeed configuration arguments + parser = deepspeed.add_config_arguments(parser) + + args = parser.parse_args(args=[]) + if not args.train_data and not args.data_dir: + print('WARNING: No training data specified') + + args.cuda = torch.cuda.is_available() + + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv('WORLD_SIZE', '1')) + if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi: + mpi_define_env(args) + elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'): + # We are using (OpenMPI) mpirun for launching distributed data parallel processes + local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK')) + local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) + + # Possibly running with Slurm + num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1')) + nodeid = int(os.getenv('SLURM_NODEID', '0')) + + args.local_rank = local_rank + args.rank = nodeid * local_size + local_rank + args.world_size = num_nodes * local_size + + args.model_parallel_size = min(args.model_parallel_size, args.world_size) + if args.rank == 0: + print('using world size: {} and model-parallel size: {} '.format( + args.world_size, args.model_parallel_size)) + + args.dynamic_loss_scale = False + if args.loss_scale is None: + args.dynamic_loss_scale = True + if args.rank == 0: + print(' > using dynamic loss scaling') + + # The args fp32_* or fp16_* meant to be active when the + # args fp16 is set. So the default behaviour should all + # be false. 
+ if not args.fp16: + args.fp32_embedding = False + args.fp32_tokentypes = False + args.fp32_layernorm = False + + if hasattr(args, 'deepspeed' + ) and args.deepspeed and args.deepspeed_config is not None: + with open(args.deepspeed_config) as file: + deepspeed_config = json.load(file) + if 'train_micro_batch_size_per_gpu' in deepspeed_config: + args.batch_size = deepspeed_config[ + 'train_micro_batch_size_per_gpu'] + if 'gradient_accumulation_steps' in deepspeed_config: + args.gradient_accumulation_steps = deepspeed_config[ + 'gradient_accumulation_steps'] + else: + args.gradient_accumulation_steps = 1 + if 'optimizer' in deepspeed_config: + optimizer_params_config = deepspeed_config['optimizer'].get( + 'params', {}) + args.lr = optimizer_params_config.get('lr', args.lr) + args.weight_decay = optimizer_params_config.get( + 'weight_decay', args.weight_decay) + return args + + +def mpi_define_env(args): + from mpi4py import MPI + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + world_size = comm.Get_size() + + master_addr = None + if rank == 0: + master_addr = get_hostname() + master_addr = comm.bcast(master_addr, root=0) + + # Determine local rank by assuming hostnames are unique + proc_name = MPI.Get_processor_name() + all_procs = comm.allgather(proc_name) + local_rank = sum([i == proc_name for i in all_procs[:rank]]) + + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + args.local_rank = local_rank + args.world_size = world_size + args.rank = rank + os.environ['MASTER_ADDR'] = master_addr + os.environ[ + 'MASTER_PORT'] = '29500' # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 + + print( + 'Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}' + .format(os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) diff --git a/modelscope/models/nlp/mglm/blocklm_utils.py b/modelscope/models/nlp/mglm/blocklm_utils.py new file mode 100644 index 00000000..9af83f67 --- /dev/null +++ b/modelscope/models/nlp/mglm/blocklm_utils.py @@ -0,0 +1,625 @@ +# Copyright (c) 2022 Zhipu.AI + +import copy +import math +import random + +import numpy as np +import torch +import torch.utils.data +from scipy.stats import poisson + +from . 
import mpu +from .utils import print_rank_0 + + +def rindex(lst, val, start=None): + if start is None: + start = len(lst) - 1 + for i in range(start, -1, -1): + if lst[i] == val: + return i + return -1 + + +def index_in_list(lst, val, start=None): + if start is None: + start = 0 + for i in range(start, len(lst)): + if lst[i] == val: + return i + return -1 + + +class ConstructBlockStrategy: + + def __init__(self, + args, + tokenizer, + max_seq_length, + bert_prob=1.0, + gap_sentence_prob=0.0, + gpt_infill_prob=0.5, + gpt_min_ratio=0.5, + bert_ratio=0.15, + gap_sentence_ratio=0.15, + average_block_length=3, + max_block_length=40, + block_mask_prob=0.0, + context_mask_ratio=0.0, + context_mask_range=3, + short_seq_prob=0.0, + single_span_prob=0.0, + block_position_encoding=True, + encoder_decoder=False, + shuffle_blocks=True, + sentinel_token=False, + task_mask=False, + random_position=False, + masked_lm=False): + self.eod_token = args.eod_token + self.tokenizer = tokenizer + self.count = 0 + self.max_seq_length = max_seq_length + self.rank = mpu.get_data_parallel_rank() + self.world_size = mpu.get_data_parallel_world_size() + # self.rank = 0 + # self.world_size = 1 + assert 0.0 <= bert_prob <= 1.0 + self.bert_prob = bert_prob + self.gap_sentence_prob = gap_sentence_prob + self.gpt_prob = 1 - bert_prob - gap_sentence_prob + assert self.gpt_prob >= -1e-10 + self.infill_prob = gpt_infill_prob + self.gpt_min_ratio = gpt_min_ratio + self.bert_ratio = bert_ratio + self.gap_sentence_ratio = gap_sentence_ratio + self.block_length_distribution = [ + poisson.pmf(i, average_block_length) + for i in range(1, max_block_length) + ] + self.block_mask_prob = block_mask_prob + self.context_mask_ratio = context_mask_ratio + self.context_mask_range = context_mask_range + self.short_seq_prob = short_seq_prob + self.single_span_prob = single_span_prob + self.block_position_encoding = block_position_encoding + self.encoder_decoder = encoder_decoder + self.shuffle_blocks = shuffle_blocks + self.sentinel_token = sentinel_token + self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = self.tokenizer.get_command( + self.generation_mask).Id + self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = self.tokenizer.get_command( + self.gap_sentence_mask).Id + self.random_position = random_position + self.masked_lm = masked_lm + print_rank_0( + f'BERT prob {self.bert_prob}, gap sent prob {self.gap_sentence_prob}, GPT prob {self.gpt_prob}, infill prob {self.infill_prob}' # noqa + ) + print_rank_0( + f'generation min ratio {self.gpt_min_ratio}, block ratio {self.bert_ratio}, gap sent ratio {self.gap_sentence_ratio}' # noqa + ) + print_rank_0( + f'block length distribution {self.block_length_distribution}') + print_rank_0( + f'block mask prob {self.block_mask_prob}, context mask ratio {self.context_mask_ratio}' + ) + + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + if ';' in tok: + return True + if ':' in tok: + return True + if '。' in tok: + return True + if '?' in tok: + return True + if '!' 
in tok: + return True + if ';' in tok: + return True + if '…' in tok: + return True + if '\n' in tok: + return True + return False + + @staticmethod + def sample_spans(span_lengths, total_length, rng, offset=0): + blank_length = total_length - sum(span_lengths) + m = blank_length - len(span_lengths) + 1 + places = [rng.randrange(m + 1) for _ in range(len(span_lengths))] + places.sort() + spans = [] + for place, span_length in zip(places, span_lengths): + start = offset + place + end = offset + place + span_length + spans.append((start, end)) + offset += span_length + 1 + return spans + + def sample_span_in_document(self, tokens, masked_lengths, rng): + rng.shuffle(masked_lengths) + mask_spans = [] + mask_index = 0 + indices = [-1] + np.where(tokens == self.eod_token)[0].tolist() + last_index = len(tokens) + documents = [] + for index in reversed(indices): + start_index = index + if start_index + 1 < len(tokens) and tokens[ + start_index + 1] == self.tokenizer.get_command('ENC').Id: + start_index += 1 + length = last_index - start_index - 1 + if last_index == len(tokens) and length > 0: + length -= 1 + documents.append((start_index + 1, length)) + last_index = index + documents.sort(key=lambda x: x[1]) + for i, (offset, length) in enumerate(documents): + if i == len(documents) - 1: + current_masked_length, current_count = 0, 0 + while mask_index + current_count < len( + masked_lengths + ) and masked_lengths[ + mask_index + # noqa + current_count] + current_masked_length + current_count <= length: + current_masked_length += masked_lengths[mask_index + + current_count] + current_count += 1 + if current_count > 0: + spans = self.sample_spans( + masked_lengths[mask_index:mask_index + current_count], + length, + rng, + offset=offset) + mask_spans += spans + if mask_index + current_count < len(masked_lengths) - 1: + print(length, masked_lengths[mask_index:], + masked_lengths[:mask_index], indices) + else: + current_masked_total = int(length * self.bert_ratio) + current_masked_length, current_count = 0, 0 + while mask_index + current_count < len( + masked_lengths + ) and masked_lengths[ + mask_index + # noqa + current_count] + current_masked_length <= current_masked_total: + current_masked_length += masked_lengths[mask_index + + current_count] + current_count += 1 + if current_count > 0: + spans = self.sample_spans( + masked_lengths[mask_index:mask_index + current_count], + length, + rng, + offset=offset) + mask_spans += spans + mask_index += current_count + return mask_spans + + def make_masked_data(self, + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task='bert'): + position_ids = np.arange(len(tokens), dtype=np.long) + targets = copy.deepcopy(tokens) + mask_id = self.tokenizer.get_command('MASK').Id + mlm_masks = np.zeros(len(tokens), dtype=np.long) + for start, end in block_spans: + for idx in range(start, end): + tokens[idx] = mask_id + mlm_masks[start:end] = 1 + loss_masks = loss_masks * mlm_masks + return tokens, targets, loss_masks, position_ids + + def make_block_data(self, + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task='bert'): + text_length = len(tokens) + position_ids = np.ones(len(tokens), dtype=np.long) + for start, end in block_spans: + position_ids[start + 1:end] = 0 + position_ids = np.cumsum(position_ids) - 1 + if self.random_position and position_ids[-1] < self.max_seq_length - 1: + position_bias = self.max_seq_length - position_ids[-1] + position_bias = rng.randrange(0, position_bias) + position_ids = position_ids + position_bias + if 
self.encoder_decoder or not self.shuffle_blocks: + block_spans.sort(key=lambda x: x[0]) + else: + rng.shuffle(block_spans) + if self.sentinel_token: + block_spans = [(start, end, idx) + for idx, (start, end) in enumerate(block_spans)] + else: + block_spans = [(start, end, 0) for start, end in block_spans] + target_tokens, target_position_ids, target_block_position_ids, targets = [], [], [], [] + for start, end, idx in block_spans: + sop_token = 'sop' if idx == 0 else f'sop{idx}' + target_tokens.append([self.tokenizer.get_command(sop_token).Id]) + span_tokens = copy.deepcopy(tokens[start:end]) + if self.block_mask_prob > 0.0 and task == 'bert': + for sub_idx in range(len(span_tokens)): + if random.random() < self.block_mask_prob: + span_tokens[sub_idx] = self.tokenizer.get_command( + 'dBLOCK').Id + target_tokens.append(span_tokens) + targets.append(tokens[start:end]) + targets.append([self.tokenizer.get_command('eop').Id]) + if not self.sentinel_token: + target_position_id = position_ids[start:end] + target_position_ids.append(target_position_id) + target_position_ids.append([target_position_id[0]]) + else: + target_position_ids.append([self.max_seq_length] * # noqa + (end - start + 1)) + if self.block_position_encoding: + target_block_position_ids.append( + np.arange(1, end - start + 2, dtype=np.long)) + else: + target_block_position_ids.append([1] * (end - start + 1)) + block_spans.sort(key=lambda x: x[0]) + source_tokens, source_position_ids, local_spans = [], [], [] + last, current_length = 0, 0 + for start, end, idx in block_spans: + if task == 'generation': + mask_id = self.generation_mask + elif task == 'gap_sentence': + mask_id = self.gap_sentence_mask + else: + mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_id = self.tokenizer.get_command(mask_token).Id + local_spans.append((current_length, current_length + start - last)) + source_tokens.append(tokens[last:start]) + source_tokens.append([mask_id]) + source_position_ids.append(position_ids[last:start]) + source_position_ids.append([position_ids[start]]) + current_length += start - last + 1 + last = end + if last < len(tokens): + local_spans.append( + (current_length, current_length + len(tokens) - last)) + source_tokens.append(tokens[last:]) + source_position_ids.append(position_ids[last:]) + source_length = sum(map(len, source_tokens)) + if attention_mask is not None: + assert source_length == attention_mask + if target_tokens and self.eod_token in np.concatenate( + target_tokens).tolist(): + print('Found EOS in target', self.tokenizer.DecodeIds(tokens)) + raise RuntimeError + if self.encoder_decoder: + target_tokens = target_tokens + [ + self.tokenizer.get_command('eop').Id + ] + loss_masks = np.ones(len(target_tokens), dtype=np.long) + return source_tokens, target_tokens, loss_masks + else: + tokens = np.concatenate(source_tokens + target_tokens) + if task == 'bert' and self.context_mask_ratio > 0: + mask_candidates = set() + for start, end in local_spans: + if start != 0: + local_end = min(end, start + self.context_mask_range) + mask_candidates.update(range(start, local_end)) + if end != 0: + local_start = max(start, end - self.context_mask_range) + mask_candidates.update(range(local_start, end)) + mask_pos = rng.sample( + mask_candidates, + int(self.context_mask_ratio * text_length)) + for pos in mask_pos: + tokens[pos] = self.tokenizer.get_command('dBLOCK').Id + targets = np.concatenate(source_tokens + targets) + loss_masks = np.ones(len(tokens), dtype=np.long) + loss_masks[:source_length] = 0 + position_ids = 
np.concatenate(source_position_ids + + target_position_ids) + block_position_ids = np.concatenate( + [np.zeros(source_length, dtype=np.long)] + + target_block_position_ids) + position_ids = np.stack([position_ids, block_position_ids], axis=0) + if attention_mask is not None: + return tokens, targets, loss_masks, position_ids + else: + return tokens, targets, loss_masks, position_ids, source_length + + def generate_blank_data(self, + sample, + masked_lengths, + attention_mask, + rng, + task='bert'): + rng.shuffle(masked_lengths) + tokens, loss_masks = sample['text'], sample['loss_mask'] + assert tokens[0] == self.tokenizer.get_command('ENC').Id + block_spans = self.sample_span_in_document(tokens, masked_lengths, rng) + if len(block_spans) < len(masked_lengths): + return None + if self.masked_lm: + data = self.make_masked_data(tokens, loss_masks, attention_mask, + block_spans, rng) + else: + data = self.make_block_data( + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task=task) + return data + + def split_samples(self, samples, rng): + target_length = rng.randrange(32, self.max_seq_length - 1) + num_splits = (self.max_seq_length - 1) // target_length + new_samples = [] + cls_id = self.tokenizer.get_command('ENC').Id + eos_id = self.tokenizer.get_command('eos').Id + for sample in samples: + tokens, loss_masks = sample['text'][1:], sample['loss_mask'][1:] + for _ in range(num_splits): + if target_length >= len(tokens): + new_tokens, new_loss_masks = tokens, loss_masks + else: + random_start = rng.randrange(0, + len(tokens) - target_length) + while random_start > 0 and ( + tokens[random_start] == eos_id or # noqa + not (self.contains_sentence_end( # noqa + tokens[random_start - 1]) or # noqa + tokens[random_start - 1] == eos_id)): # noqa + random_start -= 1 + random_end = random_start + target_length + while random_end > random_start and not ( + self.contains_sentence_end(tokens[random_end - 1]) + or tokens[random_end - 1] == eos_id): + random_end -= 1 + if random_end - random_start < target_length // 2: + random_end = random_start + target_length + new_tokens, new_loss_masks = tokens[ + random_start:random_end], loss_masks[ + random_start:random_end] + new_tokens = np.concatenate(([cls_id], new_tokens)) + new_loss_masks = np.concatenate(([0], new_loss_masks)) + new_samples.append({ + 'text': new_tokens, + 'loss_mask': new_loss_masks + }) + return new_samples + + def construct_blocks(self, samples): + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + worker_id, num_workers = worker_info.id, worker_info.num_workers + else: + worker_id, num_workers = 0, 1 + rng = random.Random((self.count * num_workers + worker_id) + * self.world_size + self.rank) + self.count += 1 + token_batch, target_batch, loss_mask_batch, position_id_batch = [], [], [], [] + source_batch, target_batch = [], [] + if rng.random() < self.short_seq_prob: + samples = self.split_samples(samples, rng) + rand = rng.random() + single_span = rand < self.single_span_prob + rand = 0.0 if single_span else rng.random() + attention_mask = [] + if rand < self.bert_prob: + mode = 'bert' + for sample in samples: + if single_span: + masked_lengths = [ + rng.choices( + range(1, + len(self.block_length_distribution) + 1), + weights=self.block_length_distribution)[0] + ] + masked_count = masked_lengths[0] + else: + masked_lengths, masked_count = [], 0 + while masked_count < int( + self.bert_ratio * len(sample['text'])): + block_length = rng.choices( + range(1, + len(self.block_length_distribution) 
+ 1), + weights=self.block_length_distribution)[0] + masked_lengths.append(block_length) + masked_count += block_length + if self.masked_lm: + sep = len(sample['text']) + else: + sep = len( + sample['text']) - masked_count + len(masked_lengths) + data = self.generate_blank_data( + sample, masked_lengths, sep, rng, task='bert') + if data is not None: + if self.encoder_decoder: + source_tokens, target_tokens, loss_masks = data + source_batch.append(source_tokens) + target_batch.append(target_tokens) + loss_mask_batch.append(loss_masks) + else: + tokens, targets, loss_masks, position_ids = data + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + attention_mask.append(sep) + + elif rand < self.bert_prob + self.gap_sentence_prob: + mode = 'sentence' + for sample in samples: + tokens, loss_masks = sample['text'], sample['loss_mask'] + sentence_spans = [] + last_index = 1 if tokens[0] == self.tokenizer.get_command( + 'ENC').Id else 0 + for i in range(len(tokens)): + if self.contains_sentence_end(tokens[i]): + if last_index < i + 1: + sentence_spans.append((last_index, i + 1)) + last_index = i + 1 + elif tokens[i] == self.tokenizer.get_command('eos').Id: + last_index = i + 1 + if last_index < len(tokens): + sentence_spans.append((last_index, len(tokens))) + if not sentence_spans and torch.distributed.get_rank() == 0: + try: + print(self.tokenizer.DecodeIds(tokens[1:])) + except IndexError: + print(tokens[1:]) + rng.shuffle(sentence_spans) + block_spans, block_length = [], 0 + for start, end in sentence_spans: + block_spans.append((start, end)) + block_length += end - start + if block_length >= int( + self.gap_sentence_ratio * len(tokens)): + break + data = self.make_block_data( + tokens, + loss_masks, + None, + block_spans, + rng, + task='gap_sentence') + tokens, targets, loss_masks, position_ids, sep = data + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + attention_mask.append(sep) + else: + # start_indices = [index_in_list(sample['loss_mask'], 1) for sample in samples] + # end_indices = [rindex(sample['loss_mask'], 1) for sample in samples] + # start_index, end_index = max(start_indices), min(end_indices) - self.min_generation_length + # if end_index < start_index + 1: + # end_index = start_index + 1 + # division = rng.randrange(start_index, end_index) + mode = 'gpt' + max_generation_length = rng.randint( + int(self.gpt_min_ratio + * min(map(lambda x: len(x['text']), samples))), + max(map(lambda x: len(x['text']), samples)) - 2) + for sample in samples: + generation_length = min(max_generation_length, + len(sample['text']) - 2) + attention_mask.append( + len(sample['text']) - generation_length + 1) + multiple_doc = index_in_list( + sample['text'], + self.tokenizer.get_command('eos').Id) not in [ + -1, len(sample['text']) - 1 + ] # noqa + if multiple_doc or rng.random() < self.infill_prob: + division = len(sample['text']) - generation_length + tokens, loss_masks = sample['text'], sample['loss_mask'] + source_tokens, target_tokens = tokens[:division], tokens[ + division:] + target_masks = loss_masks[division:] + tokens = np.concatenate((source_tokens, [ + self.generation_mask, + self.tokenizer.get_command('sop').Id + ], target_tokens[:-1])) + targets = np.concatenate( + (source_tokens, [self.generation_mask], target_tokens)) + loss_masks = np.concatenate( + (np.zeros(len(source_tokens) + 1, + dtype=np.long), 
target_masks)) + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_ids = np.arange( + len(source_tokens) + len(target_tokens) + 1, + dtype=np.long) + position_ids[len(source_tokens) + 1:] = len(source_tokens) + if self.block_position_encoding: + block_position_ids = np.concatenate( + (np.zeros(len(source_tokens), dtype=np.long), + np.arange(len(target_tokens) + 1, dtype=np.long))) + else: + block_position_ids = np.concatenate( + (np.zeros(len(source_tokens) + 1, dtype=np.long), + np.ones(len(target_tokens) + 1, dtype=np.long))) + position_id_batch.append( + np.stack([position_ids, block_position_ids], axis=0)) + else: + tokens, targets, loss_masks, position_ids = self.generate_blank_data( + sample, [generation_length], + attention_mask[-1], + rng, + task='generation') + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + if tokens is None: + print(sample, generation_length, multiple_doc) + if self.encoder_decoder: + return { + 'text': torch.tensor(source_batch, dtype=torch.long), + 'target': torch.tensor(target_batch, dtype=torch.long), + 'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long) + } + else: + token_batch, target_batch, loss_mask_batch, position_id_batch = self.pad_batch( + token_batch, target_batch, loss_mask_batch, position_id_batch) + return { + 'text': torch.tensor(token_batch, dtype=torch.long), + 'target': torch.tensor(target_batch, dtype=torch.long), + 'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long), + 'position_id': + torch.tensor(position_id_batch, dtype=torch.long), + 'attention_mask': + torch.tensor(attention_mask, dtype=torch.long), + 'mode': mode + } + + @staticmethod + def pad_batch(token_batch, target_batch, loss_mask_batch, + position_id_batch): + seq_lengths = list(map(len, token_batch)) + if seq_lengths.count(seq_lengths[0]) != len(seq_lengths): + max_length = max(seq_lengths) + token_batch = [ + np.concatenate( + (tokens, np.zeros(max_length - len(tokens), + dtype=np.long))) + for tokens in token_batch + ] + target_batch = [ + np.concatenate( + (targets, + np.zeros(max_length - len(targets), dtype=np.long))) + for targets in target_batch + ] + loss_mask_batch = [ + np.concatenate( + (loss_masks, + np.zeros(max_length - len(loss_masks), dtype=np.long))) + for loss_masks in loss_mask_batch + ] + position_id_batch = [ + np.concatenate((position_ids, + np.zeros( + (2, max_length - position_ids.shape[1]), + dtype=np.long)), + axis=1) for position_ids in position_id_batch + ] + return token_batch, target_batch, loss_mask_batch, position_id_batch diff --git a/modelscope/models/nlp/mglm/configure_data.py b/modelscope/models/nlp/mglm/configure_data.py new file mode 100644 index 00000000..6921de08 --- /dev/null +++ b/modelscope/models/nlp/mglm/configure_data.py @@ -0,0 +1,513 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
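+# A rough sketch of how this module is usually driven (the training script that
+# calls it is assumed and is not part of this patch):
+#
+#     tokenizer = prepare_tokenizer(args)        # pads the vocab, sets args.eod_token
+#     data_config = configure_data()             # DataConfig preloaded with defaults
+#     train, valid, test = data_config.apply(args, tokenizer)
+#
+# DataConfig.apply() fills in any attribute missing on `args` from its defaults
+# dict and then delegates to make_loaders(), which wraps the datasets built in
+# data_utils into (optionally distributed) torch DataLoaders.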
+"""parses arguments and preps data loader""" + +import copy +import os +import random +from bisect import bisect_right +from itertools import accumulate + +import numpy as np +import torch +import torch.utils.data + +from . import data_utils, mpu +from .blocklm_utils import ConstructBlockStrategy +from .data_utils.tokenization import make_tokenizer +from .utils import print_rank_0 + + +class MultiTaskDataset(torch.utils.data.Dataset): + + def __init__(self, + tasks, + datasets, + reweight=True, + temperature=0.8, + max_limit=200000): + super(MultiTaskDataset, self).__init__() + self.tasks = tasks + self.datasets = datasets + self.reweight = reweight + self.temperature = temperature + self.lens = [len(dataset) for dataset in datasets] + self.weights = np.array( + [min(length, max_limit)**temperature for length in self.lens]) + self.total_len = sum(self.lens) + self.cumulative_lens = list(accumulate(self.lens)) + if self.reweight: + print_rank_0(list(zip(self.tasks, self.lens, self.weights))) + else: + print_rank_0(list(zip(self.tasks, self.lens))) + self.weights /= self.weights.sum() + + def __len__(self): + return self.total_len * 1000 + + @staticmethod + def pet_wrapper(data): + text = data['text'] + loss_mask = data['logit_mask'] + target = data['target'] + attention_mask = data['mask'] + position_id = data['position'] + label = data['label'] + if len(text.shape) == 2: + text = text[label] + loss_mask = loss_mask[label] + target = target[label] + attention_mask = attention_mask[label] + position_id = position_id[label] + else: + target = target[label] + if not target.shape: + target = target.repeat(len(text)) + return { + 'text': text, + 'target': target, + 'loss_mask': loss_mask, + 'position_id': position_id, + 'attention_mask': attention_mask + } + + def __getitem__(self, idx): + if self.reweight: + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + dataset_idx = rng.choice( + np.arange(len(self.datasets)), p=self.weights) + dataset = self.datasets[dataset_idx] + sample_idx = rng.choice(np.arange(len(dataset))) + item = self.datasets[dataset_idx][sample_idx] + else: + dataset_idx = bisect_right(self.cumulative_lens, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_lens[dataset_idx - 1] + item = self.datasets[dataset_idx][sample_idx] + item = self.pet_wrapper(item) + return item + + +class DataConfig: + + def __init__(self, defaults=None): + super(DataConfig, self).__init__() + if defaults is None: + defaults = {} + self.defaults = defaults + + def apply(self, args, tokenizer): + if torch.distributed.get_rank() == 0: + print('configuring data') + self.apply_defaults(args) + return make_loaders(args, tokenizer) + + def set_defaults(self, **kwargs): + for k, v in kwargs.items(): + self.defaults[k] = v + + def apply_defaults(self, args): + for k, v in self.defaults.items(): + k = k.replace('-', '_') + if not hasattr(args, k): + setattr(args, k, v) + + +def prepare_tokenizer(args): + add_sentinel_token = 0 + if args.sentinel_token: + add_sentinel_token = args.max_position_embeddings + tokenizer = make_tokenizer( + args.tokenizer_type, + None, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + add_block_symbols=args.block_lm, + cache_dir=args.cache_dir, + add_sentinel_token=add_sentinel_token, + add_task_mask=args.task_mask, + add_decoder_mask=args.block_mask_prob > 0.0 + or args.context_mask_ratio > 0.0) + if mpu.get_model_parallel_rank() == 0: + num_tokens = 
tokenizer.num_tokens + eod_token = tokenizer.get_command('eos').Id + assert eod_token == tokenizer.get_command('pad').Id + before = num_tokens + after = before + multiple = args.make_vocab_size_divisible_by + while (after % multiple) != 0: + after += 1 + print_rank_0('> padded vocab (size: {}) with {} dummy ' + 'tokens (new size: {})'.format(before, after - before, + after)) + print_rank_0('> found end-of-document token: {}'.format(eod_token)) + token_counts = torch.cuda.LongTensor([after, eod_token]) + else: + token_counts = torch.cuda.LongTensor([0, 0]) + # Broadcast num tokens. + torch.distributed.broadcast( + token_counts, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + num_tokens = token_counts[0].item() + eod_token = token_counts[1].item() + args.vocab_size, args.eod_token = num_tokens, eod_token + return tokenizer + + +def make_data_loader(dataset, + tokenizer, + batch_size, + num_iters, + args, + shuffle=False, + block_collate=False): + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group()) + if args.loader_scatter is not None: + rank = rank // args.loader_scatter + world_size = world_size // args.loader_scatter + batch_size = batch_size // args.loader_scatter + distributed = world_size > 1 + if args.transformer_xl: + batch_sampler = data_utils.samplers.DistributedSequentialSampler( + len(dataset), num_iters, batch_size, rank, world_size) + else: + if shuffle: + sampler = data_utils.samplers.RandomSampler( + dataset, + replacement=True, + num_samples=batch_size * args.train_iters + * args.gradient_accumulation_steps) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + drop_last = distributed + # the GPUs in the same model parallel group receive the same data + if distributed: + batch_sampler = data_utils.samplers.DistributedBatchSampler( + sampler, + batch_size, + drop_last, + rank, + world_size, + gradient_accumulation_steps=args.gradient_accumulation_steps) + else: + batch_sampler = torch.utils.data.BatchSampler( + sampler, batch_size, drop_last) + collate_fn = None + if block_collate: + collate_fn = ConstructBlockStrategy( + args, + tokenizer, + args.seq_length, + bert_prob=args.bert_prob, + gap_sentence_prob=args.gap_sentence_prob, + gap_sentence_ratio=args.gap_sentence_ratio, + gpt_infill_prob=args.gpt_infill_prob, + average_block_length=args.avg_block_length, + gpt_min_ratio=args.gpt_min_ratio, + block_mask_prob=args.block_mask_prob, + context_mask_ratio=args.context_mask_ratio, + short_seq_prob=args.short_seq_prob, + single_span_prob=args.single_span_prob, + shuffle_blocks=not args.no_shuffle_block, + block_position_encoding=not args.no_block_position, + sentinel_token=args.sentinel_token, + encoder_decoder=args.encoder_decoder, + task_mask=args.task_mask, + random_position=args.random_position, + masked_lm=args.masked_lm).construct_blocks + data_loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + collate_fn=collate_fn) + + return data_loader + + +def make_tfrecord_loaders(args): + """Load train/val/test dataset from shuffled TFRecords""" + + import data_utils.tf_dl + data_set_args = { + 'batch_size': args.batch_size, + 'max_seq_len': args.seq_length, + 'max_preds_per_seq': args.max_preds_per_seq, + 'train': True, + 'num_workers': max(args.num_workers, 1), + 'seed': args.seed + args.rank + 1, + 'threaded_dl': args.num_workers > 0 + } + train = 
data_utils.tf_dl.TFRecordDataLoader(args.train_data, + **data_set_args) + data_set_args['train'] = False + if args.eval_seq_length is not None: + data_set_args['max_seq_len'] = args.eval_seq_length + if args.eval_max_preds_per_seq is not None: + data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + valid = None + if args.valid_data is not None: + valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data, + **data_set_args) + test = None + if args.test_data is not None: + test = data_utils.tf_dl.TFRecordDataLoader(args.test_data, + **data_set_args) + tokenizer = data_utils.make_tokenizer( + args.tokenizer_type, + train, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + cache_dir=args.cache_dir) + + return (train, valid, test), tokenizer + + +def make_loaders(args, tokenizer): + """makes training/val/test""" + + if args.use_tfrecords: + return make_tfrecord_loaders(args) + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + if args.loader_scatter is not None: + assert world_size % args.loader_scatter == 0 + batch_size = args.batch_size * world_size + eval_batch_size = batch_size + if args.eval_batch_size is not None: + eval_batch_size = args.eval_batch_size * world_size + seq_length = args.seq_length + if seq_length < 0: + seq_length = seq_length * world_size + eval_seq_length = args.eval_seq_length + if eval_seq_length is not None and eval_seq_length < 0: + eval_seq_length = eval_seq_length * world_size + split = get_split(args) + data_set_args = { + 'path': args.train_data, + 'seq_length': seq_length, + 'mem_length': args.mem_length, + 'delim': args.delim, + 'text_key': args.text_key, + 'label_key': 'label', + 'ds_type': args.data_set_type, + 'split': split, + 'loose': args.loose_json, + 'max_preds_per_seq': args.max_preds_per_seq, + 'presplit_sentences': args.presplit_sentences, + 'sample_one_document': args.sample_one_document, + 'filter_english': args.filter_english, + 'pre_tokenize': not args.no_pre_tokenize, + 'tokenizer': tokenizer, + 'save_splits': args.save_splits, + 'load_splits': args.load_splits, + 'save_test_data': args.save_test_data, + 'no_lazy_loader': args.no_lazy_loader, + 'loader_scatter': args.loader_scatter, + 'data_parallel_rank': mpu.get_data_parallel_rank(), + 'non_sentence_start': args.non_sentence_start, + 'half_lazy_loader': args.half_lazy_loader + } + + eval_set_args = copy.copy(data_set_args) + eval_set_args['split'] = [1.] 
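+    # evaluation sets are never re-split: a split of [1.] keeps the whole file in
+    # a single partition, so should_split() in data_utils returns False for it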
+ # if optional eval args were set then replace their + # equivalent values in the arg dict + if eval_seq_length: + eval_set_args['seq_length'] = eval_seq_length + if args.eval_max_preds_per_seq: + eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + if args.eval_text_key is not None: + eval_set_args['text_key'] = args.eval_text_key + + # make datasets splits and tokenizer + train, valid, test = None, None, None + + if args.train_data is not None: + train = data_utils.make_dataset(**data_set_args) + if data_utils.should_split(split): + train, valid, test = train + eval_set_args['tokenizer'] = tokenizer + + # make training and val dataset if necessary + if valid is None and args.valid_data is not None: + eval_set_args['path'] = args.valid_data + valid = data_utils.make_dataset(**eval_set_args) + eval_set_args['tokenizer'] = tokenizer + if test is None and args.test_data is not None: + eval_set_args['path'] = args.test_data + test = data_utils.make_dataset(**eval_set_args) + + # wrap datasets with data loader + use_block = args.block_lm or args.encoder_decoder + + if train is not None and args.batch_size > 0: + train = make_data_loader( + train, + tokenizer, + batch_size, + args.train_iters, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_train = True + else: + args.do_train = False + eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size + if valid is not None: + valid = make_data_loader( + valid, + tokenizer, + eval_batch_size, + args.train_iters, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_valid = True + else: + args.do_valid = False + if test is not None: + test = make_data_loader( + test, + tokenizer, + eval_batch_size, + len(test) // eval_batch_size + 1, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_test = True + else: + args.do_test = False + + return train, valid, test + + +def build_multi_task_dataset(args, tokenizer): + task_dirs = { + 'mnli': 'MNLI', + 'cola': 'CoLA', + 'mrpc': 'MRPC', + 'qnli': 'QNLI', + 'qqp': 'QQP', + 'sst2': 'SST-2', + 'agnews': 'Agnews', + 'yelp-polarity': 'yelp_review_polarity_csv', + 'yelp-full': 'yelp_review_full_csv', + 'yahoo': 'Yahoo', + 'squad': 'SQuAD', + 'race': 'RACE' + } + train, valid = None, None + if mpu.get_model_parallel_rank() == 0: + multi_seq_length = args.seq_length + if args.multi_seq_length is not None: + multi_seq_length = args.multi_seq_length + train_datasets, valid_datasets = [], [] + for task in args.multi_task_data: + task = task.lower() + data_dir = os.path.join(args.data_dir, task_dirs[task]) + train_datasets.append( + SuperGlueDataset( + args, + task, + data_dir, + multi_seq_length, + 'train', + tokenizer, + pattern_ensemble=True)) + valid_datasets.append( + SuperGlueDataset( + args, + task, + data_dir, + multi_seq_length, + 'dev', + tokenizer, + pattern_ensemble=True)) + train = MultiTaskDataset(args.multi_task_data, train_datasets) + valid = MultiTaskDataset(args.multi_task_data, valid_datasets) + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + multi_batch_size = args.batch_size * world_size + if args.multi_batch_size is not None: + multi_batch_size = args.multi_batch_size * world_size + train = make_data_loader( + train, + tokenizer, + multi_batch_size, + args.train_iters, + args, + shuffle=True) + valid = make_data_loader( + valid, + tokenizer, + multi_batch_size, + args.train_iters, + args, + shuffle=True) + return train, valid + + +def get_split(args): + """ + Get dataset splits from 
comma separated string list + """ + splits = [] + if args.split.find(',') != -1: + splits = [float(s) for s in args.split.split(',')] + elif args.split.find('/') != -1: + splits = [float(s) for s in args.split.split('/')] + else: + splits = [float(args.split)] + split_total = sum(splits) + if split_total < 1.: + splits.append(1 - split_total) + while len(splits) < 3: + splits.append(0.) + splits = splits[:3] + if args.valid_data is not None: + splits[1] = 0. + if args.test_data is not None: + splits[2] = 0. + final_sum = sum(splits) + return [s / final_sum for s in splits] + + +def configure_data(): + """add cmdline flags for configuring datasets""" + # These are options that are used by data_utils, but are either + # deprecated or not meant to be exposed to the command line user. + # These options are intneded to be set in code by specific scripts. + defaults = { + 'world_size': 1, + 'rank': -1, + 'persist_state': 0, + 'lazy': False, + 'transpose': False, + 'data_set_type': 'supervised', + 'seq_length': 256, + 'eval_seq_length': 256, + 'samples_per_shard': 100 + } + + return DataConfig(defaults=defaults) diff --git a/modelscope/models/nlp/mglm/data_utils/__init__.py b/modelscope/models/nlp/mglm/data_utils/__init__.py new file mode 100644 index 00000000..fa243cb4 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/__init__.py @@ -0,0 +1,341 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for creating datasets""" +import math +import os +import random +import time + +import torch + +from . import corpora +from .datasets import (BertSentencepairDataset, BlockDataset, ConcatDataset, + GPT2Dataset, ShuffleDataset, SplitDataset, XLDataset, + split_ds) +from .lazy_loader import (LazyLoader, LazyWriter, exists_lazy, exists_scatter, + get_scatter_path) +from .samplers import DistributedBatchSampler +from .tokenization import (BertWordPieceTokenizer, CharacterLevelTokenizer, + CommandToken, GPT2BPETokenizer, Tokenization, + Tokenizer, make_tokenizer) + +TRAIN_DATA = 0 +VAL_DATA = 1 +TEST_DATA = 2 + + +def should_split(split): + """ + given split proportions checks if should split + Examples: + >>> should_split([10,0,0]) + False + >>> should_split([1,.1,.2]) + True + """ + return max(split) / sum(split) != 1. 
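+# Worked example (illustrative): get_split() in configure_data.py normalizes the
+# --split string before it reaches this check, assuming no --valid-data or
+# --test-data overrides are given:
+#
+#     '900,50,50'  ->  [0.9, 0.05, 0.05]  ->  should_split(...) is True
+#     '1'          ->  [1.0, 0.0, 0.0]    ->  should_split(...) is False
+#
+# When --valid-data / --test-data files are supplied, the corresponding
+# proportion is zeroed out and the remainder renormalized, so those sets are
+# not carved out of the training corpus here.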
+ + +def get_ext(path): + """gets path extension""" + return os.path.splitext(path)[1] + + +def get_dataset(name, + tokenizer, + pre_tokenize, + data_parallel_rank, + loader_scatter=None, + no_lazy_loader=False, + half_lazy_loader=False): + """gets dataset object based on keyword args and file at `path`""" + global_rank = torch.distributed.get_rank() + if not supported_corpus(name): + raise NotImplementedError('dataset %s is not supported' % name) + dataset = corpora.NAMED_CORPORA[name] + path = dataset.PATH + if issubclass(dataset, corpora.PromptReader): + if not (exists_lazy(path, data_type='prompt') + and exists_lazy(path, data_type='text')) and not ( + loader_scatter is not None and exists_scatter( + path, data_type='prompt', scatter_num=loader_scatter) + and exists_scatter( + path, data_type='text', scatter_num=loader_scatter)): + # create cached version of dataset for lazy loading if it doesn't exist + if global_rank == 0: + print(f'Creating lazy loader for dataset {name}') + prompt_writer = LazyWriter( + path, data_type='prompt', is_array=pre_tokenize) + text_writer = LazyWriter( + path, data_type='text', is_array=pre_tokenize) + writers = {'prompt': prompt_writer, 'text': text_writer} + reader = dataset( + writers=writers, + tokenizer=tokenizer, + tokenize=pre_tokenize) + reader.process() + prompt_writer.close() + text_writer.close() + else: + while not os.path.exists( + LazyWriter.get_len_path(path, data_type='prompt')): + time.sleep(1) + map_fn = (lambda x: x.tolist()) if pre_tokenize else None + if loader_scatter is not None: + if not (exists_scatter( + path, data_type='prompt', scatter_num=loader_scatter) + and exists_scatter( + path, data_type='text', scatter_num=loader_scatter)): + if global_rank == 0: + print(f'Creating scatter loader for dataset {name}') + prompts = LazyLoader( + path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + indices = list(range(len(texts))) + random.shuffle(indices) + segment_length = (len(indices) - 1) // loader_scatter + 1 + for i in range(loader_scatter): + scatter_path = get_scatter_path(path, scatter_rank=i) + prompt_writer = LazyWriter( + scatter_path, + data_type='prompt', + is_array=pre_tokenize) + text_writer = LazyWriter( + scatter_path, + data_type='text', + is_array=pre_tokenize) + for idx in indices[i * segment_length:(i + 1) + * segment_length]: + prompt_writer.write(prompts[idx]) + text_writer.write(texts[idx]) + prompt_writer.close() + text_writer.close() + else: + while not (exists_scatter( + path, data_type='prompt', + scatter_num=loader_scatter) and exists_scatter( + path, + data_type='text', + scatter_num=loader_scatter)): + time.sleep(1) + scatter_path = get_scatter_path( + path, scatter_rank=data_parallel_rank % loader_scatter) + print(f'Rank {global_rank} is using scatter from {scatter_path}') + prompts = LazyLoader( + scatter_path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + texts = LazyLoader( + scatter_path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + else: + prompts = LazyLoader( + path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + 
mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + text = corpora.PromptDataset( + prompt_loader=prompts, + text_loader=texts, + tokenizer=tokenizer, + to_tokenize=not pre_tokenize) + if loader_scatter is None: + if global_rank == 0: + print(f'Create dataset {name} with {len(text)} documents') + for i in range(10): + rand_id = i if i < 5 else random.randrange(len(text)) + sample_tokens = text[rand_id]['tokens'][:1024] + print(sample_tokens) + print(tokenizer.DecodeIds(sample_tokens).encode('utf-8')) + else: + for scatter_id in range(loader_scatter): + if data_parallel_rank % loader_scatter == scatter_id and data_parallel_rank // loader_scatter == 0: + print( + f'Create dataset {name} at scatter {scatter_id} with {len(text)} documents' + ) + for i in range(10): + sample_tokens = text[i]['tokens'][:1024] + print(sample_tokens) + print(tokenizer.DecodeIds(sample_tokens)) + torch.distributed.barrier() + return text + elif issubclass(dataset, corpora.KeyReader): + if not (exists_lazy(path, data_type='text') + and exists_lazy(path, data_type='mask')): + # create cached version of dataset for lazy loading if it doesn't exist + if global_rank == 0: + text_writer = LazyWriter( + path, data_type='text', is_array=pre_tokenize) + mask_writer = LazyWriter(path, data_type='mask', is_array=True) + writers = {'mask': mask_writer, 'text': text_writer} + dataset( + writers=writers, + tokenizer=tokenizer, + tokenize=pre_tokenize) + mask_writer.close() + text_writer.close() + else: + while not os.path.exists( + LazyWriter.get_len_path(path, data_type='mask')): + time.sleep(1) + map_fn = (lambda x: x.tolist()) if pre_tokenize else None + masks = LazyLoader( + path, data_type='mask', map_fn=map_fn, mem_map=True, is_array=True) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + text = corpora.KeyDataset( + mask_loader=masks, + text_loader=texts, + tokenizer=tokenizer, + to_tokenize=not pre_tokenize) + return text + + +def supported_corpus(corpus_name): + """checks if corpus name is defined in `corpora.py`""" + return corpus_name in corpora.NAMED_CORPORA + + +def make_dataset(path, + seq_length, + mem_length, + shuffle=True, + split=None, + tokenizer=None, + sample_one_document=False, + pre_tokenize=False, + ds_type='', + save_splits=None, + load_splits=None, + save_test_data=None, + no_lazy_loader=False, + loader_scatter=None, + data_parallel_rank=None, + filter_english=False, + non_sentence_start=0.0, + half_lazy_loader=False, + **kwargs): + """function to create datasets+tokenizers for common options""" + if split is None: + split = [1.] 
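+        # default: no real split requested; should_split([1.]) is False below,
+        # so the whole corpus comes back as a single (wrapped) dataset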
+ + # get one or multiple datasets and concatenate + if isinstance(path, str): + ds = get_dataset( + path, + tokenizer=tokenizer, + pre_tokenize=pre_tokenize, + no_lazy_loader=no_lazy_loader, + loader_scatter=loader_scatter, + data_parallel_rank=data_parallel_rank, + half_lazy_loader=half_lazy_loader) + else: + ds = [ + get_dataset( + p, + tokenizer=tokenizer, + pre_tokenize=pre_tokenize, + no_lazy_loader=no_lazy_loader, + loader_scatter=loader_scatter, + data_parallel_rank=data_parallel_rank, + half_lazy_loader=half_lazy_loader) for p in path + ] + ds = ConcatDataset(ds) + + # Split dataset into train/val/test (and wrap bert dataset) + def wrap_dataset(dataset): + if ds_type.lower() == 'bert': + presplit_sentences = kwargs[ + 'presplit_sentences'] if 'presplit_sentences' in kwargs else False + dataset = BertSentencepairDataset( + dataset, + max_seq_len=seq_length, + presplit_sentences=presplit_sentences) + elif ds_type.lower() == 'gpt-xl': + assert pre_tokenize + dataset = XLDataset( + dataset, + tokenizer, + max_seq_len=seq_length, + mem_len=mem_length, + sample_across_doc=not sample_one_document) + elif ds_type.lower() == 'gpt2': + dataset = GPT2Dataset( + dataset, + tokenizer, + max_seq_len=seq_length, + sample_across_doc=not sample_one_document) + elif ds_type.lower() == 'block': + dataset = BlockDataset( + dataset, + tokenizer, + max_seq_len=seq_length, + sample_across_doc=not sample_one_document, + filter_english=filter_english, + non_sentence_start=non_sentence_start) + return dataset + + if should_split(split): + ds = split_ds( + ds, + split, + shuffle=shuffle, + save_splits=save_splits, + load_splits=load_splits) + if save_test_data is not None and torch.distributed.get_rank() == 0: + test_ds = ds[-1] + with open(save_test_data, 'w', encoding='utf-8') as output: + for data in test_ds: + text = data['tokens'] + text = tokenizer.DecodeIds(text) + output.write(text) + output.write('\n') + print(f'Write test data to {save_test_data}') + ds = [wrap_dataset(d) if d is not None else None for d in ds] + else: + ds = wrap_dataset(ds) + return ds diff --git a/modelscope/models/nlp/mglm/data_utils/corpora.py b/modelscope/models/nlp/mglm/data_utils/corpora.py new file mode 100755 index 00000000..7c6f58f8 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/corpora.py @@ -0,0 +1,583 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
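+# The readers below are registered by name in NAMED_CORPORA (at the end of this
+# file) and resolved through data_utils.get_dataset(); each one points PATH at a
+# local corpus dump and implements process_line() to turn one raw record into
+# parallel (prompt, text) lists. A minimal custom reader might look roughly like
+# this hypothetical sketch (class name, PATH and the 'text' field are placeholders):
+#
+#     class my_corpus(PromptReader):
+#         PATH = '/path/to/my_corpus'
+#         assert_str = 'make sure to set PATH for my_corpus in data_utils/corpora.py'
+#
+#         def process_line(self, data, tokenizer, tokenize):
+#             text = data.get('text', '')
+#             if not text:
+#                 return [], []
+#             prompt = self.process_sample('', tokenizer, tokenize)
+#             text = self.process_sample(text, tokenizer, tokenize)
+#             return [prompt], [text]
+#
+# plus a 'my_corpus': my_corpus entry in NAMED_CORPORA so that it can be selected
+# with e.g. --train-data my_corpus.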
+"""several datasets with preset arguments""" +import os +import random +from collections import defaultdict +from multiprocessing import Process, Queue +from queue import Empty + +import json +import tqdm +from torch.utils import data + +from modelscope.models.nlp.mglm.utils import print_rank_0 +from .datasets import csv_dataset, json_dataset +from .lazy_loader import LazyLoader + +NUM_PROCESSES = 100 + + +def punctuation_standardization(string: str): + punctuation_dict = { + '\u201c': "\"", + '\u201d': "\"", + '\u2019': "'", + '\u2018': "'", + '\u2013': '-' + } + for key, value in punctuation_dict.items(): + string = string.replace(key, value) + return string + + +class KeyDataset(data.Dataset): + + def __init__(self, text_loader, mask_loader, **kwargs): + self.texts = text_loader + self.masks = mask_loader + self.is_lazy = False + if isinstance(self.texts, LazyLoader) and isinstance( + self.masks, LazyLoader): + self.text_lens = self.texts.lens + self.is_lazy = True + + def get_text_len(self, idx): + return self.text_lens[idx] + + def __getitem__(self, index): + text = self.texts[index] + mask_length = self.masks[index] + mask = [] + for i, length in enumerate(mask_length): + if i % 2 == 0: + mask += [0] * length + else: + mask += [1] * length + assert len(text) == len(mask) + return {'tokens': text, 'loss_masks': mask} + + def __len__(self): + return len(self.texts) + + +class PromptDataset(data.Dataset): + + def __init__(self, + prompt_loader, + text_loader, + tokenizer=None, + to_tokenize=False, + **kwargs): + self.prompts = prompt_loader + self.texts = text_loader + self.tokenizer = tokenizer + self.to_tokenize = to_tokenize + if isinstance(self.prompts, LazyLoader) and isinstance( + self.texts, LazyLoader): + self.prompt_lens = self.prompts.lens + self.text_lens = self.texts.lens + self.is_lazy = True + + def get_text_len(self, idx): + return self.prompt_lens[idx] + self.text_lens[idx] + + def __getitem__(self, index): + prompt = self.prompts[index] + text = self.texts[index] + if self.to_tokenize: + prompt = self.tokenizer.EncodeAsIds(prompt).tokenization + text = self.tokenizer.EncodeAsIds(text).tokenization + return { + 'tokens': prompt + text, + 'loss_masks': [0] * len(prompt) + [1] * len(text) + } + + def __len__(self): + return len(self.prompts) + + +class DataReader: + PATH = None + assert_str = None + reserve_punct = False + split_row = True + TASK_QUEUE_LIMIT = 10000000 + DONE_QUEUE_LIMIT = 10000000 + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + raise NotImplementedError + + def print_info(self, info): + pass + + def __init__(self, writers, tokenizer=None, tokenize=False, **kwargs): + print(self.PATH) + print(self.assert_str) + assert os.path.exists(self.PATH), self.assert_str + print_rank_0(f'Creating dataset from {self.PATH}') + self.tokenizer = tokenizer + self.tokenize = tokenize + self.writers = writers + + def process(self): + if os.path.isdir(self.PATH): + paths = [ + os.path.join(top, name) for top, _, names in os.walk(self.PATH) + for name in names + ] + # paths = [entry.path for entry in os.scandir(self.PATH) if + # not entry.is_dir() and not entry.name.endswith("bz2")] + else: + paths = [self.PATH] + task_queue, done_queue, info_queue = Queue( + maxsize=self.TASK_QUEUE_LIMIT), Queue( + maxsize=self.DONE_QUEUE_LIMIT), Queue() + processes = [] + for i in range(NUM_PROCESSES): + process = Process( + target=self.tokenize_worker, + args=(task_queue, done_queue, info_queue, self.tokenizer, + self.tokenize)) + process.start() + 
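+            # keep a handle on every worker: the result loop below expects one
+            # 'COMPLETE' sentinel per worker before it stops draining done_queue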
processes.append(process) + + def read_input_to_queue(): + for path in paths: + print_rank_0(f'Start reading {path}') + with open(path) as file: + items = json.load(file) + for item in items: + task_queue.put(item) + # if self.split_row: + # for row in file: + # task_queue.put(row) + # else: + # items = json.load(file) + # for item in items["RECORDS"]: + # task_queue.put(item) + print_rank_0('Read input complete') + for i in range(len(processes)): + task_queue.put('STOP') + + process = Process(target=read_input_to_queue) + process.start() + count = len(processes) + progress_bar = tqdm.tqdm() + while True: + data = done_queue.get() + if data == 'COMPLETE': + count -= 1 + if count == 0: + break + else: + self.write_result(data, self.writers) + progress_bar.update() + progress_bar.close() + self.print_info(info_queue) + + @staticmethod + def write_result(data, writers): + raise NotImplementedError + + @staticmethod + def get_token_count(contents): + return sum(map(len, contents)) + + @classmethod + def process_sample(cls, text, tokenizer, tokenize): + if isinstance(text, str) and tokenize: + if not cls.reserve_punct: + text = punctuation_standardization(text) + text = tokenizer.EncodeAsIds(text).tokenization if text else [] + return text + + @staticmethod + def trim_field(content, max_length): + if len(content) > max_length: + content = content[:max_length] + content += '......' + return content + + def process_line(self, data, tokenizer, tokenize): + raise NotImplementedError + + +class PromptReader(DataReader): + is_json = True + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + for row in iter(input.get, 'STOP'): + if row: + if self.is_json: + row = row.rstrip() + row = json.loads(row) + prompts, texts = self.process_line(row, tokenizer, tokenize) + for prompt, text in zip(prompts, texts): + output.put((prompt, text)) + output.put('COMPLETE') + + @staticmethod + def write_result(data, writers): + prompt, text = data + writers['prompt'].write(prompt) + writers['text'].write(text) + + +class KeyReader(DataReader): + PATH = '/root/data/wikipedia/wiki-key.txt' + assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + keys, contents = data['key'], data['content'] + assert len(keys) == len(contents) + for i in range(1, len(keys)): + keys[i] = ' ' + keys[i] + contents = [' ' + content for content in contents] + keys = [tokenizer.EncodeAsIds(key).tokenization for key in keys] + contents = [ + tokenizer.EncodeAsIds(content).tokenization for content in contents + ] + summary = sum(keys, []) + summary_prefix = self.process_sample('Summary: ', tokenizer, tokenize) + summary_mask = [len(summary_prefix), len(summary)] + summary = summary_prefix + summary + text, text_mask = [], [] + for key, content in zip(keys, contents): + content = content + [tokenizer.get_command('eop').Id] + text += key + text += content + text_mask.append(len(key)) + text_mask.append(len(content)) + return (summary, summary_mask), (text, text_mask) + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + for row in iter(input.get, 'STOP'): + data = json.loads(row) + summary, content = self.process_line(data, tokenizer, tokenize) + output.put((summary, content)) + output.put('COMPLETE') + + @staticmethod + def write_result(data, writers): + summary, content = data + writers['text'].write(summary[0]) + writers['mask'].write(summary[1]) + writers['text'].write(content[0]) + writers['mask'].write(content[1]) + + +class 
zhihu(PromptReader): + PATH = '/dataset/fd5061f6/data/tokenize_data/zhihu.lazy' + reserve_punct = True + assert_str = 'make sure to set PATH for zhihu data_utils/corpora.py' + qtitle_prefix = '问题:' + qcontent_prefix = '问题描述:' + user_prefix = '回答用户:' + answer_prefix = ' 回答:' + + # qtitle_prefix = [] + # qcontent_prefix = [] + # user_prefix = [] + # answer_prefix = [] + + def process_line(self, data, tokenizer, tokenize): + prompts, texts = [], [] + ans_length = len(data.get('ans-content', '')) + ans_up = data.get('ans-up-num', '') + ans_up = int(ans_up) if ans_up else 0 + if ans_length > 100 or ans_up > 1000: + qtitle = data['q_title'] + qcontent = data['q-content'] + if qcontent is None: + qcontent = '' + qcontent = self.trim_field(qcontent, max_length=100) + user = data.get('user-signature', '') + prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.user_prefix + user + self.answer_prefix # noqa + text = data['ans-content'] + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + # prompt = data["q_title"] + data["q-content"] + data["user-signature"] + # text = data["ans-content"] + # prompts.append(prompt) + # texts.append(text) + return prompts, texts + + +class zhidao(PromptReader): + PATH = '/root/data/zhidao/zhidao' + reserve_punct = True + assert_str = 'make sure to set PATH for zhidao data_utils/corpora.py' + qtitle_prefix = '问题:' + qcontent_prefix = '问题描述:' + answer_prefix = '回答:' + + def process_line(self, data, tokenizer, tokenize): + if 'title' not in data: + return [], [] + prompts, texts = [], [] + qtitle = data['title'] + qcontent = data.get('content', '') + qcontent = self.trim_field(qcontent, max_length=100) + prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.answer_prefix + prompt = self.process_sample(prompt, tokenizer, tokenize) + if 'best_answer' in data: + text = data['best_answer']['content'] + if len(text) > 10: + text = self.process_sample(text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + for answer in data.get('other_answers', []): + text = answer['content'] + if len(text) > 100: + text = self.process_sample(text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + return prompts, texts + + +class baike(PromptReader): + PATH = '/dataset/fd5061f6/data/tokenize_data/baike.lazy' + reserve_punct = True + assert_str = 'make sure to set PATH for baike data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + prompts, texts = [], [] + text = data.get('title', '') + data.get('abstract', '') + data.get( + 'content', '') + if text: + p, t = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(p) + texts.append(t) + return prompts, texts + + +class wikipedia(PromptReader): + """ + dataset for wikipedia with arguments configured for convenience + + command line usage: `--train-data wikipedia` + """ + # PATH = '/dataset/data/wiki.txt' + PATH = '/root/data/bert_data/wiki.txt' + assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + text = data['text'] + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + + +class TestDataset(PromptReader): + PATH = '/root/data/test.json' + assert_str = 'make sure to set PATH for wikipedia 
data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + prompt, text = data['prompt'], data['text'] + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + + +class OpenWebText(PromptReader): + PATH = '/dataset/fd5061f6/english_data/openwebtext2' + assert_str = 'make sure to set PATH for openwebtext data_utils/corpora.py' + + def __init__(self, *args, **kwargs): + import fasttext + super().__init__(*args, **kwargs) + self.model = fasttext.load_model( + '/dataset/fd5061f6/english_data/lid.176.bin') + print_rank_0('Load language detection model') + + def process_line(self, data, tokenizer, tokenize): + text = data['text'] + if len(text) > 100: + lang = self.model.predict(text.replace('\n', ''))[0][0] + if lang == '__label__en': + prompt, text = self.process_sample( + '', tokenizer, + tokenize), self.process_sample(text, tokenizer, tokenize) + return [prompt], [text] + return [], [] + + +class CCNews(PromptReader): + PATH = '/mnt/cc_news.json' + assert_str = 'make sure to set PATH for cc-news data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + text = '' + title = data.get('title', None) + description = data.get('description', None) + maintext = data.get('maintext', None) + if title: + text += title.strip() + ' ' + if description and (not maintext + or not maintext.startswith(description)): + text += description.strip() + ' ' + if maintext: + text += maintext + if len(text) > 100: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class BertData(PromptReader): + is_json = False + PATH = '/dataset/fd5061f6/english_data/wikibook' + + def process_line(self, data, tokenizer, tokenize): + if data: + prompt, text = '', data + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class Pile(PromptReader): + is_json = True + PATH = '/mnt/train' + filtered_sources = [ + 'Github', 'StackExchange', 'DM Mathematics', 'Ubuntu IRC', 'EuroParl', + 'YoutubeSubtitles', 'Enron Emails' + ] + downsample_sources = {'PubMed Central': 0.3, 'ArXiv': 0.3, 'FreeLaw': 0.3} + + def print_info(self, info): + total_dict = defaultdict(int) + while True: + try: + source_dict = info.get(block=False) + for source, length in source_dict.items(): + total_dict[source] += length + except Empty: + break + print_rank_0(total_dict) + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + source_dict = defaultdict(int) + for row in iter(input.get, 'STOP'): + row = row.rstrip() + if row: + if self.is_json: + row = json.loads(row) + prompts, texts, source = self.process_line( + row, tokenizer, tokenize) + length = 0 + for prompt, text in zip(prompts, texts): + length += len(text) + output.put((prompt, text)) + if source: + source_dict[source] += length + output.put('COMPLETE') + info.put(source_dict) + + def process_line(self, data, tokenizer, tokenize): + source = data['meta'].get('pile_set_name', None) + text = data.get('text', None) + if source and text: + if source in self.filtered_sources: + return [], [], None + elif source in self.downsample_sources and random.random( + ) > self.downsample_sources[source]: + return [], [], None + else: + prompt, text = self.process_sample( + '', tokenizer, + tokenize), self.process_sample(text, tokenizer, 
tokenize) + return [prompt], [text], source + else: + return [], [], None + + +class Stories(PromptReader): + is_json = True + PATH = '/dataset/fd5061f6/english_data/stories_31G.jsonl' + + def process_line(self, data, tokenizer, tokenize): + text = data.get('text', None) + if text: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class BertBaseData(BertData): + PATH = '/root/data/formatted_one_article_per_line' + + +class BertLargeData(BertData): + PATH = '/dataset/c07bd62b/cognitive/zhengxiao/formatted_one_article_per_line_large' + + +class WuDaoCorpus(PromptReader): + # PATH = "/dataset/fd5061f6/chinese_data/WuDao" + PATH = '/wudao' + is_json = False + reserve_punct = True + split_row = False + + def process_line(self, item, tokenizer, tokenize): + prompts, texts = [], [] + text = '' + title = item.get('title', None) + content = item.get('content', None) + if title: + text += title.strip() + ' ' + if content: + text += content + if len(text) > 100: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + return prompts, texts + + +NAMED_CORPORA = { + 'wikipedia': wikipedia, + 'wikipedia-key': KeyReader, + 'openwebtext': OpenWebText, + 'zhihu': zhihu, + 'zhidao': zhidao, + 'baike': baike, + 'test': TestDataset, + 'wikibook': BertData, + 'bert-base': BertBaseData, + 'bert-large': BertLargeData, + 'cc-news': CCNews, + 'pile': Pile, + 'stories': Stories, + 'wudao': WuDaoCorpus +} diff --git a/modelscope/models/nlp/mglm/data_utils/datasets.py b/modelscope/models/nlp/mglm/data_utils/datasets.py new file mode 100644 index 00000000..777b7d43 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/datasets.py @@ -0,0 +1,1244 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
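+# Rough composition of the classes in this file, as driven by
+# data_utils.make_dataset() in __init__.py:
+#
+#     corpora readers / json_dataset / csv_dataset      raw records -> token dicts
+#         -> ConcatDataset([...])                       several corpora as one dataset
+#         -> split_ds(ds, split) -> SplitDataset views  train / val / test subsets
+#         -> XLDataset / GPT2Dataset / BlockDataset / BertSentencepairDataset
+#            which build sequence-length-bounded samples for the chosen objective
+#
+# ConcatDataset, SplitDataset and ShuffleDataset only re-index the dataset they
+# wrap; items stay plain dicts such as {'text': ..., 'label': ...} or
+# {'tokens': ..., 'loss_masks': ...} depending on the underlying reader.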
+"""dataset objects for jsons, csvs, and BERT datasets""" + +import csv +import math +import os +import random +import time +from bisect import bisect_right +from itertools import accumulate +from operator import itemgetter + +import json +import nltk +import numpy as np +import pandas as pd +import torch +import tqdm +from nltk import tokenize +from torch.utils import data + +from modelscope.models.nlp.mglm.utils import print_rank_0 +from .lazy_loader import LazyLoader, exists_lazy + + +class ShuffleDataset(data.Dataset): + + def __init__(self, ds): + self.ds = ds + self.shuffle_ids = list(range(len(self.ds))) + random.shuffle(self.shuffle_ids) + self.is_lazy = hasattr(ds, 'is_lazy') and ds.is_lazy + if self.is_lazy: + self.prompt_lens = [ + self.ds.prompt_lens[idx] for idx in self.shuffle_ids + ] + self.text_lens = [ + self.ds.text_lens[idx] for idx in self.shuffle_ids + ] + + def __getitem__(self, idx): + return self.ds[self.shuffle_ids[idx]] + + def __len__(self): + return len(self.ds) + + +class ConcatDataset(data.Dataset): + """ + Dataset to concatenate multiple datasets. + Purpose: useful to assemble different existing datasets, possibly + large-scale datasets as the concatenation operation is done in an + on-the-fly manner. + Arguments: + datasets (sequence): List of datasets to be concatenated. + """ + + @staticmethod + def cumsum(sequence): + r, s = [], 0 + for e in sequence: + l = len(e) # noqa + r.append(l + s) + s += l + return r + + def __init__(self, datasets, **kwargs): + super(ConcatDataset, self).__init__() + assert len(datasets) > 0, 'datasets should not be an empty iterable' + self.datasets = list(datasets) + self.is_lazy = sum([ + isinstance(ds, LazyLoader) + or (hasattr(ds, 'is_lazy') and ds.is_lazy) for ds in self.datasets + ]) == len(self.datasets) + self.cumulative_sizes = self.cumsum(self.datasets) + self._X = None + self._Y = None + self._lens = None + + def get_text_len(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_text_len(sample_idx) + + def SetTokenizer(self, tokenizer): + for ds in self.datasets: + ds.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.datasets[0].GetTokenizer() + + def __len__(self): + return self.cumulative_sizes[-1] + + def __getitem__(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx][sample_idx] + + @property + def lens(self): + if self._lens is None: + self._lens = [] + if self.is_lazy: + for data in self.datasets: # noqa + self._lens.extend(data.lens) + else: + for data in self.datasets: # noqa + self._lens.extend([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in data + ]) + return self._lens + + @property + def X(self): + if self._X is None: + self._X = [] + for data in self.datasets: # noqa + self._X.extend(data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = [] + for data in self.datasets: # noqa + self._Y.extend(list(data.Y)) + self._Y = np.array(self._Y) + return self._Y + + +class SplitDataset(data.Dataset): + """ + Dataset wrapper to access a subset of another dataset. + Purpose: useful to index into existing datasets, possibly + large-scale datasets as the subindexing operation is done in an + on-the-fly manner. 
+ Arguments: + ds (Dataset or array-like): List of datasets to be subindexed + split_inds (1D array-like): List of indices part of subset + """ + + def __init__(self, ds, split_inds, **kwargs): + self.split_inds = list(split_inds) + self.wrapped_data = ds + self.is_lazy = isinstance(ds, LazyLoader) or (hasattr(ds, 'is_lazy') + and ds.is_lazy) + self._X = None + self._Y = None + + def __len__(self): + return len(self.split_inds) + + def get_text_len(self, idx): + return self.wrapped_data.get_text_len(self.split_inds[idx]) + + def __getitem__(self, index): + return self.wrapped_data[self.split_inds[index]] + + def SetTokenizer(self, tokenizer): + self.wrapped_data.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.wrapped_data.GetTokenizer() + + @property + def X(self): + if self._X is None: + self._X = itemgetter(*self.split_inds)(self.wrapped_data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = np.array( + itemgetter(*self.split_inds)(self.wrapped_data.Y)) + return self._Y + + def __iter__(self): + for idx in self.split_inds: + yield self.wrapped_data[idx] + + +def split_ds(ds, split=None, shuffle=True, save_splits=None, load_splits=None): + """ + Split a dataset into subsets given proportions of how + much to allocate per split. If a split is 0% returns None for that split. + Purpose: Useful for creating train/val/test splits + Arguments: + ds (Dataset or array-like): Data to be split. + split (1D array-like): proportions to split `ds`. `sum(splits) != 0` + shuffle (boolean): Randomly split dataset. Default: True + save_splits: save split indices to file + load_splits: load split indices from file + """ + if split is None: + split = [.8, .2, .0] + split_sum = sum(split) + if split_sum == 0: + raise Exception('Split cannot sum to 0.') + split = np.array(split) + split /= split_sum + ds_len = len(ds) + inds = np.arange(ds_len) + if shuffle: + rng = np.random.RandomState(1234) + rng.shuffle(inds) + if load_splits is not None: + inds = np.load(load_splits) + assert len(inds) == ds_len + print_rank_0(f'Load split indices from {load_splits}') + elif save_splits is not None: + if torch.distributed.get_rank() == 0: + np.save(save_splits, inds) + print(f'Save split indices to {save_splits}') + start_idx = 0 + residual_idx = 0 + rtn_ds = [None] * len(split) + for i, f in enumerate(split): + if f != 0: + proportion = ds_len * split[i] + residual_idx += proportion % 1 + split_ = int(int(proportion) + residual_idx) + split_inds = inds[start_idx:start_idx + max(split_, 1)] + rtn_ds[i] = SplitDataset(ds, split_inds) + start_idx += split_ + residual_idx %= 1 + return rtn_ds + + +class csv_dataset(data.Dataset): + """ + Class for loading datasets from csv files. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): Path to csv file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): Callable that process a string into desired format. + delim (str): delimiter for csv. Default: ',' + binarize_sent (bool): binarize label values to 0 or 1 if they\'re on a different scale. Default: False + drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty + columns with -1 (regardless if rows are dropped based on value) Default: False + text_key (str): key to get text from csv. Default: 'sentence' + label_key (str): key to get label from json dictionary. 
Default: 'label' + Attributes: + X (list): all strings from the csv file + Y (np.ndarray): labels to train with + """ + + def __init__(self, + path, + tokenizer=None, + preprocess_fn=None, + delim=',', + binarize_sent=False, + drop_unlabeled=False, + text_key='sentence', + label_key='label', + **kwargs): + self.is_lazy = False + self.preprocess_fn = preprocess_fn + self.SetTokenizer(tokenizer) + self.path = path + self.delim = delim + self.text_key = text_key + self.label_key = label_key + self.drop_unlabeled = drop_unlabeled + + if '.tsv' in self.path: + self.delim = '\t' + + self.X = [] + self.Y = [] + try: + cols = [text_key] + if isinstance(label_key, list): + cols += label_key + else: + cols += [label_key] + data = pd.read_csv( + self.path, sep=self.delim, usecols=cols, encoding='latin-1') + except: # noqa + data = pd.read_csv( + self.path, + sep=self.delim, + usecols=[text_key], + encoding='latin-1') + + data = data.dropna(axis=0) + + self.X = data[text_key].values.tolist() + try: + self.Y = data[label_key].values + except Exception as e: # noqa + self.Y = np.ones(len(self.X)) * -1 + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __len__(self): + return len(self.X) + + def __getitem__(self, index): + """process+tokenize string and return string,label,and stringlen""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a csv file + """ + if path is None: + path = self.path + '.results' + print('generating csv at ' + path) + with open(path, 'w') as csvfile: + c = csv.writer(csvfile, delimiter=self.delim) + if writer_gen is not None: + # if first item of generator is a header of what the metrics mean then write header to csv file + if not skip_header: + header = (self.label_key, ) + tuple( + next(writer_gen)) + (self.text_key, ) + c.writerow(header) + for i, row in enumerate(writer_gen): + row = (self.Y[i], ) + tuple(row) + (self.X[i], ) + c.writerow(row) + else: + c.writerow([self.label_key, self.text_key]) + for row in zip(self.Y, self.X): + c.writerow(row) + + +class json_dataset(data.Dataset): + """ + Class for loading datasets from a json dump. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): path to json file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): callable function that process a string into desired format. + Takes string, maxlen=None, encode=None as arguments. Default: process_str + text_key (str): key to get text from json dictionary. 
Default: 'sentence' + label_key (str): key to get label from json dictionary. Default: 'label' + Attributes: + all_strs (list): list of all strings from the dataset + all_labels (list): list of all labels from the dataset (if they have it) + """ + + def __init__(self, + path, + tokenizer=None, + preprocess_fn=None, + binarize_sent=False, + text_key='sentence', + label_key='label', + loose_json=False, + **kwargs): + self.is_lazy = False + self.preprocess_fn = preprocess_fn + self.path = path + self.SetTokenizer(tokenizer) + self.X = [] + self.Y = [] + self.text_key = text_key + self.label_key = label_key + self.loose_json = loose_json + + for j in self.load_json_stream(self.path): + s = j[text_key] + self.X.append(s) + self.Y.append(j[label_key]) + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __getitem__(self, index): + """gets the index'th string from the dataset""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def __len__(self): + return len(self.X) + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a json file + """ + if path is None: + path = self.path + '.results' + + if writer_gen is not None: + # if first item of generator is a header of what the metrics mean then write header to csv file + def gen_helper(): + keys = {} + keys[0] = self.label_key + if not skip_header: + for idx, k in enumerate(tuple(next(writer_gen))): + keys[idx + 1] = k + for i, row in enumerate(writer_gen): + if i == 0 and skip_header: + for idx, _ in enumerate(row): + keys[idx + 1] = 'metric_%d' % (idx, ) + j = {} + for idx, v in enumerate((self.Y[i], ) + tuple(row)): + k = keys[idx] + j[k] = v + yield j + else: + + def gen_helper(): + for y in self.Y: + j = {} + j[self.label_key] = y + yield j + + def out_stream(): + for i, j in enumerate(gen_helper()): + j[self.text_key] = self.X[i] + yield j + + self.save_json_stream(path, out_stream()) + + def save_json_stream(self, save_path, json_stream): + if self.loose_json: + with open(save_path, 'w') as f: + for i, j in enumerate(json_stream): + write_string = '' + if i != 0: + write_string = '\n' + write_string += json.dumps(j) + f.write(write_string) + else: + jsons = [j for j in json_stream] + json.dump(jsons, open(save_path, 'w'), separators=(',', ':')) + + def load_json_stream(self, load_path): + if not self.loose_json: + jsons = json.load(open(load_path, 'r')) + generator = iter(jsons) + else: + + def gen_helper(): + with open(load_path, 'r') as f: + for row in f: + yield json.loads(row) + + generator = gen_helper() + + for j in generator: + if self.label_key not in j: + j[self.label_key] = -1 + yield j + + +class XLDataset(data.Dataset): + + 
def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + mem_len=None, + sample_across_doc=True, + **kwargs): + self.ds = ds + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + if mem_len is None: + mem_len = max_seq_len + self.mem_len = mem_len + self.sample_across_doc = sample_across_doc + self.indices, self.num_samples = None, None + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_indices() + + def init_indices(self): + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['prompt']) + + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.indices = list(accumulate(lens)) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.indices[-1]}' + ) + self.num_samples = self.indices[-1] // self.max_seq_len + 1 + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + tokens, targets, loss_mask, attention_mask = self.getidx(idx) + tokens = self.pad_seq(tokens) + targets = self.pad_seq(targets) + loss_mask = self.pad_seq(loss_mask, pad_id=0) + return { + 'text': np.array(tokens), + 'target': np.array(targets), + 'loss_mask': np.array(loss_mask), + 'attention_mask': np.array(attention_mask) + } + + def getidx(self, idx): + tokens, targets, loss_masks = [], [], [] + attention_mask = np.concatenate( + (np.zeros((self.max_seq_len, self.mem_len), dtype=np.long), + np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)), + axis=1) + sample_idx = bisect_right(self.indices, idx * self.max_seq_len) + last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1] + token_offset = idx * self.max_seq_len - last_end + if token_offset != 0: + history = min(self.mem_len, token_offset) + attention_mask[:, + -self.max_seq_len - history:-self.max_seq_len] = 1 + count = 0 + while len(tokens) < self.max_seq_len and sample_idx < len(self.ds): + item = self.ds[sample_idx] + text, masks = item['tokens'], item['loss_masks'] + text = text + [self.tokenizer.get_command('eos').Id] + end = min( + len(text) - 1, token_offset + self.max_seq_len - len(tokens)) + masks = masks + [1] + if count > 0: + current = len(tokens) + attention_mask[current:, :current + self.mem_len] = 0 + tokens += text[token_offset:end] + targets += text[token_offset + 1:end + 1] + loss_masks += masks[token_offset + 1:end + 1] + count += 1 + sample_idx += 1 + token_offset = 0 + return tokens, targets, loss_masks, attention_mask + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + +class BlockDataset(data.Dataset): + + def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + sample_across_doc=True, + non_sentence_start=0.0, + filter_english=False, + **kwargs): + """ + sentence_start: the stripped article must start with a complete sentence + """ + self.ds = ds + self.ds_len = len(self.ds) + self.num_samples = 1000 * self.ds_len + self.max_seq_len = max_seq_len + self.tokenizer = tokenizer + self.sample_across_doc = sample_across_doc + self.non_sentence_start = non_sentence_start + self.filter_english = filter_english + self.weighting, self.total_len = None, None + self.is_lazy = False + if self.filter_english: + import fasttext + self.model = fasttext.load_model('/mnt/lid.176.bin') + print_rank_0('Load language detection model') + if 
hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_weighting() + + def init_weighting(self): + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.total_len}, non sentence start{self.non_sentence_start}' # noqa + ) + self.weighting = list(accumulate(lens)) + + def get_weighted_samples(self, np_rng): + while True: + idx = np_rng.randint(self.total_len) + data_idx = bisect_right(self.weighting, idx) + tokens, loss_mask = self.getidx(data_idx) + if self.filter_english: + text = self.tokenizer.DecodeIds(tokens[:1024]) + lang = self.model.predict(text.replace('\n', ''))[0][0] + if lang == '__label__en': + break + else: + break + return tokens, loss_mask + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # init rng + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + + # get possibly weighted random index from dataset + tokens, loss_mask = self.get_weighted_samples(rng) + # truncate or pad tokens + num_tokens = len(tokens) + tokens_to_strip = num_tokens - self.max_seq_len + 1 + + # randomly choose a position for start + if tokens_to_strip > 0: + move_count = 0 + strip_left_tokens = rng.randint(tokens_to_strip) + if rng.random() > self.non_sentence_start: + if rng.random() < 0.5: + while move_count < self.max_seq_len // 2 and strip_left_tokens > 0 and not self.contains_sentence_end( # noqa + tokens[strip_left_tokens - 1]): # noqa + strip_left_tokens -= 1 + move_count += 1 + else: + while move_count < self.max_seq_len // 2 and strip_left_tokens < len( + tokens) and not self.contains_sentence_end( + tokens[strip_left_tokens - 1]): + strip_left_tokens += 1 + move_count += 1 + tokens = [self.tokenizer.get_command('ENC').Id + ] + tokens[strip_left_tokens:] + loss_mask = [0] + loss_mask[strip_left_tokens:] + if len(tokens) == 2 and tokens[1] == self.tokenizer.get_command( + 'eos').Id: + tokens, loss_mask = [], [] + tokens, loss_mask = self.right_strip_seq(tokens, loss_mask, + self.max_seq_len) + else: + tokens = [self.tokenizer.get_command('ENC').Id] + tokens + loss_mask = [0] + loss_mask + # Sample multiple documents + if self.sample_across_doc: + while len(tokens) < self.max_seq_len: + new_tokens, new_loss_mask = self.get_weighted_samples(rng) + new_tokens = [self.tokenizer.get_command('ENC').Id + ] + new_tokens + new_loss_mask = [0] + new_loss_mask + is_last = len(new_tokens) >= self.max_seq_len - len(tokens) + new_tokens, new_loss_mask = self.right_strip_seq( + new_tokens, new_loss_mask, + self.max_seq_len - len(tokens)) + tokens += new_tokens + loss_mask += new_loss_mask + if is_last: + break + return {'text': np.array(tokens), 'loss_mask': np.array(loss_mask)} + + def right_strip_seq(self, tokens, loss_mask, seq_length): + strip_right_tokens = len(tokens) - seq_length + if strip_right_tokens > 0: + while strip_right_tokens < len( + tokens) - 1 and not self.contains_sentence_end( + tokens[-strip_right_tokens - 1]): + strip_right_tokens += 1 + if len(tokens) - strip_right_tokens < seq_length // 2: + strip_right_tokens = len(tokens) - seq_length + tokens = tokens[:-strip_right_tokens] + loss_mask = loss_mask[:-strip_right_tokens] + return tokens, loss_mask + + def getidx(self, data_idx): + data = self.ds[data_idx] + tokens, 
loss_masks = data['tokens'], data['loss_masks'] + tokens = tokens + [self.tokenizer.get_command('eos').Id] + loss_masks = loss_masks + [1] + return tokens, loss_masks + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + # TODO: rewrite this function for chinese + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + if ';' in tok: + return True + if ':' in tok: + return True + if '\n' in tok: + return True + return False + + +class GPT2Dataset(data.Dataset): + + def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + num_samples=None, + weighted=True, + sample_across_doc=True, + random_across_doc_sampling=True, + sentence_start=False, + **kwargs): + """ + sentence_start: the stripped article must start with a complete sentence + """ + self.ds = ds + self.ds_len = len(self.ds) + self.num_samples = num_samples + if num_samples is None: + self.num_samples = 1000 * self.ds_len + self.max_seq_len = max_seq_len + self.tokenizer = tokenizer + self.weighted = weighted + self.sample_across_doc = sample_across_doc + self.random_across_doc_sampling = random_across_doc_sampling + self.sentence_start = sentence_start + self.weighting, self.total_len = None, None + self.is_lazy = False + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_weighting() + + def init_weighting(self): + if self.weighted: + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.total_len}' + ) + self.weighting = list(accumulate(lens)) + else: + self.weighting = None + + def get_weighted_samples(self, np_rng): + if self.weighting is not None: + idx = np_rng.randint(self.total_len) + return bisect_right(self.weighting, idx) + else: + return np_rng.randint(self.ds_len) + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # init rng + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + + # get possibly weighted random index from dataset + data_idx = self.get_weighted_samples(rng) + # data_idx = rng.choice(self.ds_len, p=self.weighting) + tokens, loss_mask = self.getidx(data_idx) + + # truncate or pad tokens + num_tokens = len(tokens) + tokens_to_strip = num_tokens - self.max_seq_len - 1 + + # randomly choose a position for start + if tokens_to_strip > 0: + strip_left_tokens = rng.randint(tokens_to_strip + 1) + tokens = tokens[strip_left_tokens:] + loss_mask = loss_mask[strip_left_tokens:] + # if self.sentence_start: + # token_copy = list(tokens) + # not_done = True + # while (len(token_copy) > 0) and not_done: + # tok = token_copy.pop(0) + # if self.contains_sentence_end(tok): + # tokens = token_copy + # not_done = False + strip_right_rokens = len(tokens) - self.max_seq_len - 1 + if strip_right_rokens > 0: + tokens = tokens[:-strip_right_rokens] + loss_mask = loss_mask[:-strip_right_rokens] + # Sample multiple documents + if self.sample_across_doc: + while (len(tokens) < (self.max_seq_len + 1)): + if self.random_across_doc_sampling: + data_idx = 
self.get_weighted_samples(rng) + else: + data_idx = (data_idx + 1) % self.ds_len + new_tokens, new_loss_mask = self.getidx(data_idx) + tokens += new_tokens + loss_mask += new_loss_mask + tokens = tokens[:(self.max_seq_len + 1)] + loss_mask = loss_mask[:(self.max_seq_len + 1)] + + tokens = self.pad_seq(tokens) + loss_mask = self.pad_seq(loss_mask, pad_id=0) + return {'text': np.array(tokens), 'loss_mask': np.array(loss_mask)} + + def getidx(self, data_idx): + data = self.ds[data_idx] + tokens, loss_masks = data['tokens'], data['loss_masks'] + tokens = tokens + [self.tokenizer.get_command('eos').Id] + loss_masks = loss_masks + [1] + return tokens, loss_masks + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + 1 + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + # TODO: rewrite this function for chinese + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + return False + + +class BertSentencepairDataset(data.Dataset): + """ + Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair. + Arguments: + ds (Dataset or array-like): data corpus to use for training + max_seq_len (int): maximum sequence length to use for a sentence pair + mask_lm_prob (float): proportion of tokens to mask for masked LM + max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10 + short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len + dataset_size (int): number of random sentencepairs in the dataset. 
Default: len(ds)*(len(ds)-1) + + """ # noqa + + def __init__(self, + ds, + max_seq_len=512, + mask_lm_prob=.15, + max_preds_per_seq=None, + short_seq_prob=.01, + dataset_size=None, + presplit_sentences=False, + weighted=True, + **kwargs): + self.ds = ds + self.ds_len = len(self.ds) + self.tokenizer = self.ds.GetTokenizer() + self.vocab_words = list(self.tokenizer.text_token_vocab.values()) + self.ds.SetTokenizer(None) + self.max_seq_len = max_seq_len + self.mask_lm_prob = mask_lm_prob + if max_preds_per_seq is None: + max_preds_per_seq = math.ceil(max_seq_len * mask_lm_prob / 10) * 10 + self.max_preds_per_seq = max_preds_per_seq + self.short_seq_prob = short_seq_prob + self.dataset_size = dataset_size + if self.dataset_size is None: + self.dataset_size = self.ds_len * (self.ds_len - 1) + self.presplit_sentences = presplit_sentences + if not self.presplit_sentences: + nltk.download('punkt', download_dir='./nltk') + self.weighted = weighted + self.get_weighting() + + def get_weighting(self): + if self.weighted: + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + lens = np.array(self.ds.lens) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + self.weighting = list(accumulate(lens)) + else: + self.weighting = None + + def get_weighted_samples(self, np_rng): + if self.weighting is not None: + idx = np_rng.randint(self.total_len) + return bisect_right(self.weighting, idx) + else: + return np_rng.randint(self.ds_len) + + def __len__(self): + return self.dataset_size + + def __getitem__(self, idx): + # get rng state corresponding to index (allows deterministic random pair) + rng = random.Random(idx) + np_rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + # get seq length + target_seq_length = self.max_seq_len + short_seq = False # noqa + if rng.random() < self.short_seq_prob: + target_seq_length = rng.randint(2, target_seq_length) + short_seq = True # noqa + + # get sentence pair and label + is_random_next = None + lena = 0 + lenb = 0 + while (is_random_next is None) or (lena < 1) or (lenb < 1): + tokensa, tokensb, is_random_next = self.create_random_sentencepair( + target_seq_length, rng, np_rng) + lena = len(tokensa[0]) + lenb = len(tokensb[0]) + + # truncate sentence pair to max_seq_len + tokensa, tokensb = self.truncate_seq_pair(tokensa, tokensb, + self.max_seq_len, rng) + # join sentence pair, mask, and pad + tokens, mask, mask_labels, pad_mask = self.create_masked_lm_predictions( + tokensa, tokensb, self.mask_lm_prob, self.max_preds_per_seq, + self.vocab_words, rng) + sample = { + 'text': np.array(tokens[0]), + 'types': np.array(tokens[1]), + 'is_random': int(is_random_next), + 'mask': np.array(mask), + 'mask_labels': np.array(mask_labels), + 'pad_mask': np.array(pad_mask) + } + return sample + + def sentence_split(self, document): + """split document into sentences""" + lines = document.split('\n') + if self.presplit_sentences: + return [line for line in lines if line] + rtn = [] + for line in lines: + if line != '': + rtn.extend(tokenize.sent_tokenize(line)) + return rtn + + def sentence_tokenize(self, + sent, + sentence_num=0, + beginning=False, + ending=False): + """tokenize sentence and get token types""" + tokens = self.tokenizer.EncodeAsIds(sent).tokenization + str_type = 'str' + str(sentence_num) + token_types = [self.tokenizer.get_type(str_type).Id] * len(tokens) + return tokens, token_types + + def get_doc(self, idx): + """gets text of document corresponding 
to idx""" + rtn = self.ds[idx] + if isinstance(rtn, dict): + rtn = rtn['text'] + return rtn + + def create_random_sentencepair(self, target_seq_length, rng, np_rng): + """ + fetches a random sentencepair corresponding to rng state similar to + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294 + """ + is_random_next = None + + curr_strs = [] + curr_str_types = [] + curr_len = 0 + + while curr_len < 1: + curr_len = 0 + doc_a = None + while doc_a is None: + if self.weighted: + # doc_a_idx = np_rng.choice(self.ds_len, p=self.weighting) + doc_a_idx = self.get_weighted_samples(np_rng) + else: + doc_a_idx = rng.randint(0, self.ds_len - 1) + doc_a = self.sentence_split(self.get_doc(doc_a_idx)) + if not doc_a: + doc_a = None + + random_start_a = rng.randint(0, len(doc_a) - 1) + while random_start_a < len(doc_a): + sentence = doc_a[random_start_a] + sentence, sentence_types = self.sentence_tokenize( + sentence, 0, random_start_a == 0, + random_start_a == len(doc_a)) + curr_strs.append(sentence) + curr_str_types.append(sentence_types) + curr_len += len(sentence) + if random_start_a == len( + doc_a) - 1 or curr_len >= target_seq_length: + break + random_start_a = (random_start_a + 1) + + if curr_strs: + num_a = 1 + if len(curr_strs) >= 2: + num_a = rng.randint(0, len(curr_strs)) + + tokens_a = [] + token_types_a = [] + for j in range(num_a): + tokens_a.extend(curr_strs[j]) + token_types_a.extend(curr_str_types[j]) + + tokens_b = [] + token_types_b = [] + is_random_next = False + if len(curr_strs) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + b_len = 0 + while b_len < 1: + doc_b = None + while doc_b is None: + doc_b_idx = rng.randint(0, self.ds_len - 2) + doc_b_idx += int(doc_b_idx >= doc_a_idx) + + doc_b = self.sentence_split(self.get_doc(doc_b_idx)) + if not doc_b: + doc_b = None + + random_start_b = rng.randint(0, len(doc_b) - 1) + while random_start_b < len(doc_b): + sentence_b = doc_b[random_start_b] + new_b_tokens, new_b_types = self.sentence_tokenize( + sentence_b, 1, random_start_b == 0, + random_start_b == len(doc_b)) + b_len += len(new_b_tokens) + tokens_b.extend(new_b_tokens) + token_types_b.extend(new_b_types) + if len(tokens_b) >= target_b_length: + break + random_start_b = (random_start_b + 1) + else: + is_random_next = False + for j in range(num_a, len(curr_strs)): + tokens_b.extend(curr_strs[j]) + token_types_b.extend(curr_str_types[j]) + + return (tokens_a, token_types_a), (tokens_b, + token_types_b), is_random_next + + def truncate_seq_pair(self, a, b, max_seq_len, rng): + """ + Truncate sequence pair according to original BERT implementation: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + max_num_tokens = max_seq_len - 3 + while True: + len_a = len(tokens_a) + len_b = len(tokens_b) + total_length = len_a + len_b + if total_length <= max_num_tokens: + break + if len(tokens_a) > len(tokens_b): + trunc_tokens = tokens_a + trunc_types = token_types_a + else: + trunc_tokens = tokens_b + trunc_types = token_types_b + + assert len(trunc_tokens) >= 1 + + if rng.random() < 0.5: + trunc_tokens.pop(0) + trunc_types.pop(0) + else: + trunc_tokens.pop() + trunc_types.pop() + return (tokens_a, token_types_a), (tokens_b, token_types_b) + + def mask_token(self, idx, tokens, types, vocab_words, rng): + """ + helper function to mask `idx` token from `tokens` according to + section 3.3.1 of 
https://arxiv.org/pdf/1810.04805.pdf + """ + label = tokens[idx] + if rng.random() < 0.8: + new_label = self.tokenizer.get_command('MASK').Id + else: + if rng.random() < 0.5: + new_label = label + else: + new_label = rng.choice(vocab_words) + + tokens[idx] = new_label + + return label + + def pad_seq(self, seq): + """helper function to pad sequence pair""" + num_pad = max(0, self.max_seq_len - len(seq)) + pad_mask = [0] * len(seq) + [1] * num_pad + seq += [self.tokenizer.get_command('pad').Id] * num_pad + return seq, pad_mask + + def create_masked_lm_predictions(self, a, b, mask_lm_prob, + max_preds_per_seq, vocab_words, rng): + """ + Mask sequence pair for BERT training according to: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [ + self.tokenizer.get_command('sep').Id + ] + tokens_b + [self.tokenizer.get_command('sep').Id] + token_types = [token_types_a[0]] + token_types_a + [ + token_types_a[0] + ] + token_types_b + [token_types_b[0]] + + len_a = len(tokens_a) + len_b = len(tokens_b) + + cand_indices = [idx + 1 for idx in range(len_a) + ] + [idx + 2 + len_a for idx in range(len_b)] + + rng.shuffle(cand_indices) + + output_tokens, pad_mask = self.pad_seq(list(tokens)) + output_types, _ = self.pad_seq(list(token_types)) + + num_to_predict = min(max_preds_per_seq, + max(1, int(round(len(tokens) * mask_lm_prob)))) + + mask = [0] * len(output_tokens) + mask_labels = [-1] * len(output_tokens) + + for idx in sorted(cand_indices[:num_to_predict]): + mask[idx] = 1 + label = self.mask_token(idx, output_tokens, output_types, + vocab_words, rng) + mask_labels[idx] = label + + return (output_tokens, output_types), mask, mask_labels, pad_mask diff --git a/modelscope/models/nlp/mglm/data_utils/extraction.py b/modelscope/models/nlp/mglm/data_utils/extraction.py new file mode 100644 index 00000000..53027e4f --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/extraction.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 Zhipu.AI + +import glob +import os + +import json +import nltk + +nltk.download('punkt') + + +class NLTKSegmenter: + + def __init(self): + pass + + @staticmethod + def segment_string(article): + return nltk.tokenize.sent_tokenize(article) + + +wiki_path = 'data/extracted' +output_path = 'formatted/wiki-key.txt' +segmenter = NLTKSegmenter() +with open(output_path, 'w') as output: + for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False): + for filename in glob.glob( + os.path.join(dirname, 'wiki_*'), recursive=True): + print(filename) + article_lines = [] + article_open = False + with open(filename, mode='r', newline='\n') as file: + for line in file: + line = line.rstrip() + if '' in line: + key_sentences, contents = [], [] + key, content = None, [] + for sentences in article_lines[1:]: + if len(sentences) > 1: + if key: + if len(content) > 0 or len(contents) == 0: + key_sentences.append(key) + contents.append(content) + else: + contents[-1].append(key) + key, content = None, [] + key_sentences.append(sentences[0]) + contents.append(sentences[1:]) + elif len(sentences) > 0: + if key: + content.append(sentences[0]) + else: + key = sentences[0] + if key: + if len(content) > 0 or len(contents) == 0: + key_sentences.append(key) + contents.append(content) + else: + contents[-1].append(key) + contents = [' '.join(content) for content in contents] + article = {'key': key_sentences, 'content': contents} + 
output.write(json.dumps(article)) + output.write('\n') + article_open = False + article_lines = [] + else: + if article_open and line: + sentences = segmenter.segment_string(line) + article_lines.append(sentences) diff --git a/modelscope/models/nlp/mglm/data_utils/file_utils.py b/modelscope/models/nlp/mglm/data_utils/file_utils.py new file mode 100755 index 00000000..794e127a --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/file_utils.py @@ -0,0 +1,256 @@ +# Modified by Zhipu.AI +# This file is provided as is from: +# https://github.com/huggingface/pytorch-pretrained-BERT +# Please refer to their repository for copyright. +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import os +import shutil +import sys +import tempfile +from functools import wraps +from hashlib import sha256 +from io import open +from urllib.parse import urlparse + +import boto3 +import json +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + Path.home() / '.pytorch_pretrained_bert')) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + 'PYTORCH_PRETRAINED_BERT_CACHE', + os.path.join(os.path.expanduser('~'), '.pytorch_pretrained_bert')) + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError('file {} not found'.format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError('file {} not found'.format(meta_path)) + + with open(meta_path, encoding='utf-8') as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. 
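+
+ Illustrative usage (the URL and path below are placeholders, not real resources):
+     local_path = cached_path('https://example.com/vocab.txt')  # downloaded and cached
+     local_path = cached_path('/tmp/vocab.txt')  # returned as-is if the file exists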
+ """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError('file {} not found'.format(url_or_filename)) + else: + # Something unknown + raise ValueError( + 'unable to parse {} as a URL or as a local path'.format( + url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError('bad s3 path {}'.format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith('/'): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response['Error']['Code']) == 404: + raise EnvironmentError('file {} not found'.format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url): + """Check ETag on S3 object.""" + s3_resource = boto3.resource('s3') + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file): + """Pull a file directly from S3.""" + s3_resource = boto3.resource('s3') + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file): + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit='B', total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. + if url.startswith('s3://'): + etag = s3_etag(url) + else: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + raise IOError( + 'HEAD request failed for url {} with status code {}'.format( + url, response.status_code)) + etag = response.headers.get('ETag') + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. 
+ # Otherwise you get corrupt cache entries if the download gets interrupted. + with tempfile.NamedTemporaryFile() as temp_file: + logger.info('%s not found in cache, downloading to %s', url, + temp_file.name) + + # GET file object + if url.startswith('s3://'): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info('copying %s to cache at %s', temp_file.name, + cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info('creating metadata file for %s', cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w', encoding='utf-8') as meta_file: + json.dump(meta, meta_file) + + logger.info('removing temp file %s', temp_file.name) + + return cache_path + + +def read_set_from_file(filename): + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r', encoding='utf-8') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path, dot=True, lower=True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/modelscope/models/nlp/mglm/data_utils/lazy_loader.py b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py new file mode 100644 index 00000000..77a77a8a --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py @@ -0,0 +1,286 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for loading text from disk""" +import mmap +import os +import pickle as pkl +import time +from itertools import accumulate + +import numpy as np +import torch +from torch.multiprocessing import Lock + + +def get_lazy_path(path): + """ + Gets directory path where lazy files are stored. + """ + return os.path.splitext(path)[0] + '.lazy' + + +def exists_lazy(path, data_type='data'): + """ + Check if we've already made a lazy version of this file for the `data_type` field. 
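+ A lazy version is considered to exist when both `<path>.lazy/<data_type>`
+ and `<path>.lazy/<data_type>.len.pkl` are present on disk.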
+ """ + if not os.path.exists(get_lazy_path(path)): + return False + contents = os.listdir(get_lazy_path(path)) + if data_type not in contents: + return False + if data_type + '.len.pkl' not in contents: + return False + return True + + +def get_scatter_path(path, scatter_rank): + path = os.path.splitext(path)[0] + '.scatter' + scatter_path = os.path.join(path, str(scatter_rank)) + return scatter_path + + +def exists_scatter(path, scatter_num=64, data_type='data'): + for i in range(scatter_num): + scatter_path = get_scatter_path(path, scatter_rank=i) + if not exists_lazy(scatter_path, data_type=data_type): + return False + return True + + +class LazyWriter: + + def __init__(self, + path, + data_type, + is_array=False, + array_data_type=np.int32): + lazypath = get_lazy_path(path) + if not os.path.exists(lazypath): + os.makedirs(lazypath) + self.datapath = os.path.join(lazypath, data_type) + self.lenpath = os.path.join(lazypath, data_type + '.len.pkl') + self.array_data_type = array_data_type + self.output = open(self.datapath, 'wb') + self.lengths = [] + self.is_array = is_array + + @staticmethod + def get_len_path(path, data_type): + lazypath = get_lazy_path(path) + return os.path.join(lazypath, data_type + '.len.pkl') + + def write(self, s): + if isinstance(s, dict): + s = s['text'] + if self.is_array: + encoded = np.array( + s, dtype=self.array_data_type).tobytes(order='C') + self.output.write(encoded) + self.lengths.append(len(s)) + else: + encoded = s.encode('utf-8') + self.output.write(encoded) + self.lengths.append(len(encoded)) + + def close(self): + self.output.close() + with open(self.lenpath, 'wb') as f: + pkl.dump(self.lengths, f) + + +def split_strings(strings, start, chr_lens): + """ + Split strings based on string lengths and given start. + """ + return [ + strings[i - start:j - start] + for i, j in zip([start] + chr_lens[:-1], chr_lens) + ] + + +class ProcessorTokenizer: + """ + callable class that runs a preprocessing, as well as tokenization step, + on input text. + """ + + def __init__(self, tokenizer, process_fn=None): + self.tokenizer = tokenizer + self.process_fn = process_fn + + def __call__(self, string): + if self.tokenizer is not None: + string = self.tokenizer(string, process_fn=self.process_fn) + elif self.process_fn is not None: + string = self.process_fn(string) + return string + + +class LazyLoader(object): + """ + Arguments: + path: path to directory where array entries are concatenated into one big string file + and the .len file are located + data_type (str): Some datsets have multiple fields that are stored in different paths. + `data_type` specifies which of these fields to load in this class + mem_map (boolean): Specifies whether to memory map file `path` + map_fn (callable): Fetched strings are passed through map_fn before being returned. 
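+ is_array (boolean): Whether entries are numpy arrays of `array_data_type`
+     (as written by `LazyWriter` with `is_array=True`) instead of utf-8 strings
+ array_data_type (numpy dtype): dtype of the entries when `is_array` is True
+ load_memory (boolean): Read the whole data file into memory up front instead of
+     seeking into it (or memory mapping it) on each access
+ half_load (boolean): Keep only a leading subset (the first two thirds) of the
+     stored entries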
+ + Example of lazy loader directory structure: + file.json + file.lazy/ + data_type1 + data_type1.len.pkl + data_type2 + data_type2.len.pkl + """ + + def __init__(self, + path, + data_type='data', + mem_map=False, + map_fn=None, + is_array=False, + array_data_type=np.int32, + load_memory=False, + half_load=False): + lazypath = get_lazy_path(path) + datapath = os.path.join(lazypath, data_type) + # get file where array entries are concatenated into one big string + self._file = open(datapath, 'rb') + self.file = self._file + self.is_array = is_array + self.array_data_type = array_data_type + # memory map file if necessary + lenpath = os.path.join(lazypath, data_type + '.len.pkl') + self.lens = pkl.load(open(lenpath, 'rb')) + if half_load: + self.lens = self.lens[:2 * len(self.lens) // 3] + self.ends = list(accumulate(self.lens)) + self.dumb_ends = list(self.ends) + self.mem_map = mem_map + self.load_memory = load_memory + if self.load_memory: + data_type_size = np.dtype(self.array_data_type).itemsize + if half_load: + self.file = self.file.read(sum(self.lens) * data_type_size) + else: + self.file = self.file.read() + self.file = np.ndarray( + shape=(len(self.file) // data_type_size, ), + dtype=array_data_type, + buffer=self.file, + order='C') + elif self.mem_map: + if is_array: + if self.ends[-1] == 0: + self.file = np.array([], dtype=array_data_type) + else: + self.file = np.memmap( + self.file, dtype=array_data_type, mode='r', order='C') + else: + if self.ends[-1] == 0: + self.file = bytearray() + else: + self.file = mmap.mmap( + self.file.fileno(), 0, prot=mmap.PROT_READ) + self.read_lock = Lock() + self.process_fn = map_fn + self.map_fn = map_fn + self._tokenizer = None + self.is_lazy = True + + def SetTokenizer(self, tokenizer): + """ + logic to set and remove (set to None) tokenizer. + combines preprocessing/tokenization into one callable. 
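+ When a tokenizer is supplied, each fetched string is passed through a
+ `ProcessorTokenizer`, so preprocessing and tokenization happen at read time.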
+ """ + if tokenizer is None: + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self._tokenizer = tokenizer + self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) + + def GetTokenizer(self): + return self._tokenizer + + def __getitem__(self, index): + """ + read file and splice strings based on string ending array `self.ends` + """ + if not isinstance(index, slice): + if index == 0: + start = 0 + else: + start = self.ends[index - 1] + end = self.ends[index] + rtn = self.file_read(start, end) + if self.map_fn is not None: + rtn = self.map_fn(rtn) + else: + # if slice, fetch strings with 1 diskread and then splice in memory + chr_lens = self.ends[index] + if index.start == 0 or index.start is None: + start = 0 + else: + start = self.ends[index.start - 1] + stop = chr_lens[-1] + strings = self.file_read(start, stop) + rtn = split_strings(strings, start, chr_lens) + if self.map_fn is not None: + rtn = [self.map_fn(s) for s in rtn] + return rtn + + def __len__(self): + return len(self.ends) + + def file_read(self, start=0, end=None): + """read specified portion of file""" + data_type_size = np.dtype(self.array_data_type).itemsize + # atomic reads to avoid race conditions with multiprocess dataloader + self.read_lock.acquire() + if not self.mem_map and not self.load_memory: + # seek to start of file read + if self.is_array: + start = start * data_type_size + end = end * data_type_size if end is not None else None + self.file.seek(start) + # read to end of file if no end point provided + if end is None: + rtn = self.file.read() + # else read amount needed to reach end point + else: + rtn = self.file.read(end - start) + if self.is_array: + rtn = np.ndarray( + shape=(len(rtn) // data_type_size, ), + dtype=self.array_data_type, + buffer=rtn, + order='C') + else: + rtn = rtn.decode('utf-8', 'ignore') + else: + rtn = self.file[start:end] + if self.is_array: + rtn = rtn.copy() + else: + rtn = rtn.decode('utf-8', 'strict') + self.read_lock.release() + # TODO: @raulp figure out mem map byte string bug + # if mem map'd need to decode byte string to string + # # rtn = str(rtn) + # if self.mem_map: + # rtn = rtn.decode('unicode_escape') + return rtn diff --git a/modelscope/models/nlp/mglm/data_utils/samplers.py b/modelscope/models/nlp/mglm/data_utils/samplers.py new file mode 100644 index 00000000..c0f6e1ab --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/samplers.py @@ -0,0 +1,190 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""batch samplers that work with either random or sequential data samplers""" +import math +import os +import sys + +import numpy as np +import torch +from torch.utils import data + + +class RandomSampler(data.sampler.Sampler): + r""" + Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, + but this class lets the user set an epoch like DistributedSampler + Samples elements randomly. 
If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify ``num_samples`` to draw. + Arguments: + data_source (Dataset): dataset to sample from + num_samples (int): number of samples to draw, default=len(dataset) + replacement (bool): samples are drawn with replacement if ``True``, default=False + """ + + def __init__(self, data_source, replacement=False, num_samples=None): + super(RandomSampler, self).__init__(data_source) + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.epoch = -1 + + if self._num_samples is not None and replacement is False: + raise ValueError( + 'With replacement=False, num_samples should not be specified, ' + 'since a random permute will be performed.') + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError('num_samples should be a positive integer ' + 'value, but got num_samples={}'.format( + self.num_samples)) + if not isinstance(self.replacement, bool): + raise ValueError('replacement should be a boolean value, but got ' + 'replacement={}'.format(self.replacement)) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + g = torch.Generator() + if self.epoch >= 0: + g.manual_seed(self.epoch) + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint( + high=n, size=(32, ), dtype=torch.int64, + generator=g).tolist() + yield from torch.randint( + high=n, + size=(self.num_samples % 32, ), + dtype=torch.int64, + generator=g).tolist() + else: + yield from torch.randperm(n, generator=self.generator).tolist() + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedSequentialSampler(data.sampler.Sampler): + + def __init__(self, + num_samples, + train_iters, + batch_size, + rank=-1, + world_size=2): + super().__init__(num_samples) + if rank == -1: + rank = 0 + world_size = 1 + self.num_samples = num_samples + self.rank = rank + self.world_size = world_size + self.start_iter = 0 + self.train_iters = train_iters + self.batch_size = batch_size + self.batch_bias = [ + i * (num_samples // batch_size) for i in range(batch_size) + ] + + def __iter__(self): + for idx in range(self.start_iter, self.train_iters * 10): + batch = [(idx + bias) % self.num_samples + for bias in self.batch_bias] + tbatch = self._batch(batch) + yield tbatch + + def __len__(self): + return self.train_iters + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] + + +class DistributedBatchSampler(data.sampler.BatchSampler): + """ + similar to normal implementation of distributed sampler, except implementation is at the + batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary + data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
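+
+ Illustrative usage (argument values are placeholders):
+     sampler = RandomSampler(dataset)
+     batch_sampler = DistributedBatchSampler(sampler, batch_size=16, drop_last=True,
+                                             rank=rank, world_size=world_size)
+     loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)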
+ """ + + def __init__(self, + sampler, + batch_size, + drop_last, + rank=-1, + world_size=2, + wrap_last=False, + gradient_accumulation_steps=None): + super(DistributedBatchSampler, self).__init__(sampler, batch_size, + drop_last) + if rank == -1: + assert False, 'should not be here' + self.rank = rank + self.world_size = world_size + self.sampler.wrap_around = 0 + self.wrap_around = 0 + self.wrap_last = wrap_last + self.start_iter = 0 + self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps # noqa + + def __iter__(self): + batch = [] + i = 0 + for idx in self.data_iterator(self.sampler, wrap_around=False): + batch.append(idx) + if len(batch) == self.batch_size: + tbatch = self._batch(batch) + if i >= self.start_iter * self.effective_batch_size: + yield tbatch + self.start_iter = 0 + i += len(batch) + batch = [] + batch_len = len(batch) + if batch_len > 0 and not self.drop_last: + if self.wrap_last: + self.sampler.wrap_around -= (self.batch_size) + self.wrap_around += (len(batch)) + self.wrap_around %= self.batch_size + yield self._batch(batch) + if self.wrap_last: + self.sampler.wrap_around += self.batch_size + + def data_iterator(self, _iter, wrap_around=False): + """iterates through data and handles wrap around""" + for i, idx in enumerate(_iter): + if i < self.wrap_around % self.batch_size: + continue + if wrap_around: + self.wrap_around += 1 + self.wrap_around %= self.batch_size + yield idx + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] diff --git a/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py b/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py new file mode 100644 index 00000000..b4d1afe3 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py @@ -0,0 +1,158 @@ +# Modified by Zhipu.AI +""" +from https://github.com/openai/gpt-2/, changed for chinese +""" +import os # yapf: disable + + +""" +SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation +systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements +subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the +extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end +system that does not depend on language-specific pre/postprocessing. 
+https://github.com/google/sentencepiece + +pip install sentencepiece + +or git clone https://github.com/google/sentencepiece.git +python setup.py install + +""" + + +def get_pairs(word): + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Encoder: + + def __init__(self, encoder, bpe_merges): + self.encoder = encoder + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.max_len = 0 + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + if not pairs: + return token + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + return [self.encoder.get(token, 1) for token in self.tokenize(text)] + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + return text + + def tokenize(self, text): + bpe_tokens = [] + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + return [self.encoder.get(token, 1) for token in tokens] + + +class Encoder_SP: + + def __init__(self, model_path): + import sentencepiece as spm + self.sp = spm.SentencePieceProcessor() + self.sp.Load(model_path) + + def encode(self, text): + """ + text="...." + """ + return self.sp.EncodeAsIds(text) + + def decode(self, tokens): + """ + tokens=[x1,x2,...] 
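+ Returns the detokenized text as a single string.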
+ """ + text = [int(token) for token in tokens] + # print(text) + return self.sp.DecodeIds(text) + + def tokenize(self, text): + return self.sp.EncodeAsPieces(text) + + def convert_tokens_to_ids(self, tokens): + return [self.sp.PieceToId(token) for token in tokens] + + def convert_token_to_id(self, token): + return self.sp.PieceToId(token) + + def convert_id_to_token(self, idx): + return self.sp.IdToPiece(idx) + + +def get_encoder(encoder_file, bpe_file): + import json + filepath, filename = os.path.split(encoder_file) + shotname, extension = os.path.splitext(filename) + + if ('.model' == extension) and (bpe_file == ''): + return Encoder_SP(encoder_file) + else: + with open(encoder_file, 'r', encoding='utf-8') as f: + encoder = json.load(f) + with open(bpe_file, 'r', encoding='utf-8') as f: + bpe_data = f.read() + bpe_merges = [ + tuple(merge_str.split()) + for merge_str in bpe_data.split('\n')[1:-1] + ] + return Encoder( + encoder=encoder, + bpe_merges=bpe_merges, + ) + + +def from_pretrained(model_path): + return get_encoder(model_path + '/tokenizer/mglm250k/mglm250k-uni.model', + '') diff --git a/modelscope/models/nlp/mglm/data_utils/tokenization.py b/modelscope/models/nlp/mglm/data_utils/tokenization.py new file mode 100755 index 00000000..c89cc371 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/tokenization.py @@ -0,0 +1,1396 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" +import csv +import itertools +import os +import random +from collections import namedtuple + +import nltk +import regex as re +import sentencepiece as spm +import torch +from nltk import tokenize as nltk_tokenize + +from . import sp_tokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .wordpiece import PRETRAINED_VOCAB_ARCHIVE_MAP, BertTokenizer + + +def make_tokenizer(tokenizer_type, + corpus, + model_path=None, + vocab_size=None, + model_type=None, + pad_token=0, + character_coverage=1.0, + command_tokens=None, + type_tokens=None, + **kwargs): + """ + Helper function to instantiate a tokenizer given common combinations of options. + """ + tokenizer_class = tokenizer_type + if isinstance(tokenizer_class, str): + tokenizer_class = eval(tokenizer_class) + if tokenizer_class is BertWordPieceTokenizer: + return BertWordPieceTokenizer(model_type, **kwargs) + elif tokenizer_class is GPT2BPETokenizer: + if model_type is None: + model_type = 'gpt2' + return GPT2BPETokenizer(model_type, **kwargs) + elif tokenizer_class is ChineseSPTokenizer: + return ChineseSPTokenizer(model_path, **kwargs) + text_tokenizer = tokenizer_class( + corpus=corpus, + vocab_size=vocab_size, + model_path=model_path, + model_type=model_type, + pad_token=pad_token, + character_coverage=character_coverage) + return Tokenizer(text_tokenizer, command_tokens, type_tokens) + + +class Tokenization(object): + """ + Tokenization object to hold tokenization, (processed text),and original + text. 
Can hold tokenization as Ids or tokens. + + It also holds command tokens (pad, unk, etc.) for the tokenization. + This allows functions to pad/operate on tokenizations without having + access to the full tokenizer, just the tokenization. + + Several standard array operations are implemented (insert, append, extend). + """ + + def __init__(self, + tokenization, + text=None, + original_text=None, + command_tokens=None, + asIds=True): + self.tokenization = tokenization + self.text = text + if self.text is None: + self.text = self.tokenization + self.original_text = original_text + if self.original_text is None: + self.original_text = self.text + self.command_tokens = command_tokens + self.asIds = asIds + self.parse_command_tokens() + + def set_command_tokens(self, command_tokens): + self.command_tokens = command_tokens + return self.parse_command_tokens() + + def parse_command_tokens(self): + if self.command_tokens is None: + return + for command_token in self.command_tokens: + if self.asIds: + setattr(self, command_token.name, command_token.Id) + else: + setattr(self, command_token.name, command_token.token) + + def __getitem__(self, index): + return self.tokenization[index] + + def __len__(self): + return len(self.tokenization) + + def insert(self, idx, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.insert(idx, other.Id) + if idx == 0: + self.text = other.token + self.text + self.original_text = other.token + self.original_text + elif idx == len(self.tokenization) - 1: + self.text += other.token + self.original_text += other.token + elif isinstance(other, Tokenization): + self.tokenization = self.tokenization[: + idx] + other.tokenization + self.tokenization[ + idx:] + else: + self.tokenization = self.tokenization[: + idx] + other.tokenization + self.tokenization[ + idx:] + + def append(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text += other.token + self.original_text += other.token + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.append(other) + return self + + def extend(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text += other.token + self.original_text += other.token + elif isinstance(other, list) and isinstance(other[0], + (CommandToken, TypeToken)): + self.tokenization.extend([o.Id for o in other]) + self.text += [o.token for o in other] + self.original_text += [o.token for o in other] + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.extend(other) + return self + + +"""define some default command tokens for the tokenizer to use""" +token_format = '<{0}>' + +COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id')) + + +def prep_command_tokens(tokenlist, token_format=token_format): + return [ + CommandToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class CommandToken(object): + + def __init__(self, name, token, Id, lstrip=False, rstrip=False): + self.name = name + self.token = token + self.Id = Id + self.lstrip = lstrip + self.rstrip = rstrip + + def __str__(self): + return str(COMMAND_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_COMMAND_TOKENS = [ + ('pad', 0), + ('eos', 1), + ('bos', 2), + ('unk', 3), + 
('sep', 4), + ('L2R', 5), + ('ENC', 6), + ('MASK', 7), +] +DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) +"""define some default type tokens for bert training""" + +TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id')) + + +def prep_type_tokens(tokenlist, token_format=token_format): + return [ + TypeToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class TypeToken(object): + + def __init__(self, name, token, Id): + self.name = name + self.token = token + self.Id = Id + + def __str__(self): + return str(TYPE_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_TYPE_TOKENS = [ + ('function', 0), + ('command', 1), + ('str0', 2), + ('str1', 3), + ('str2', 4), + ('embedding0', 5), + ('embedding1', 6), + ('embedding2', 7), + ('arg0', 8), + ('arg1', 9), + ('arg2', 10), +] +DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS) + + +class Tokenizer(object): + """ + Tokenizer object that handles text tokenization, command tokens, and type tokens. + + Command tokens and text tokens are stored together in one mapping of size + `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first + `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. + + Token types are stored in a separate mapping of size `len(type_tokens)`. + """ + + def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): + # set text tokenizer + self.text_tokenizer = text_tokenizer + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = len(self.text_tokenizer) + + # set command tokens + if command_tokens is None: + command_tokens = DEFAULT_COMMAND_TOKENS + self._command_tokens = command_tokens + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + if not hasattr(self, 'num_command_tokens'): + self.num_command_tokens = len(self._command_tokens) + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_command_tokens + self.num_text_tokens + + # set type tokens + if type_tokens is None: + type_tokens = DEFAULT_TYPE_TOKENS + self.type_tokens = type_tokens + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + if not hasattr(self, 'num_type_tokens'): + self.num_type_tokens = len(self.type_tokens) + + # parse tokens and vocabs from tokenizer + self._tokens = list(self.command_token_map.keys()) + list( + self.text_tokenizer.tokens) + self._vocab = {t: Id for Id, t in self.command_id_map.items()} + self._vocab.update({ + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + }) + + self._text_tokens = list(self.text_tokenizer.tokens) + self._text_token_vocab = { + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def __call__(self, text, process_fn=None): + """run preprocessing and encode text as Ids""" + return self.EncodeAsIds(text, process_fn=process_fn) + + def __len__(self): + """total number of tokens""" + return self.num_tokens + 
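+    # A minimal illustration of the id layout described in the class
+    # docstring (assuming the DEFAULT_COMMAND_TOKENS defined above; the
+    # variable names here are placeholders, not part of this module):
+    #
+    #   tok = Tokenizer(text_tokenizer)
+    #   tok.TokenToId('<pad>')   # -> 0; command ids occupy [0, num_command_tokens)
+    #   tok.TokenToId(piece)     # -> text_tokenizer.TokenToId(piece) + num_command_tokens
+    #   tok.IdToToken(3)         # -> '<unk>', since 3 < num_command_tokens
+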
+ def get_command(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def get_type(self, name): + """get type token corresponding to `name`""" + return self.type_name_map[name] + + @property + def tokens(self): + """list (or iterable) of all tokens for tokenizer""" + return self._tokens + + @property + def vocab(self): + """dictionary mapping tokens to ids for tokenizer""" + return self._vocab + + @property + def token_types(self): + """list (or iterable) of all token types for tokenizer""" + return self._token_types + + @property + def token_type_vocab(self): + """dictionary mapping token types to ids for tokenizer""" + return self._token_type_vocab + + @property + def command_tokens(self): + """list (or iterable) of all command tokens for tokenizer""" + return self._command_token_tokens + + @property + def command_token_vocab(self): + """dictionary mapping command tokens to ids for tokenizer""" + return self._command_token_vocab + + @property + def text_tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + return self._text_tokens + + @property + def text_token_vocab(self): + """dictionary mapping text tokens to ids for text tokenizer""" + return self._text_token_vocab + + def EncodeAsIds(self, text, process_fn=None): + """ + encode text using text tokenizer and shift Id values for command tokens + """ + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. 
https://github.com/huggingface/transformers/pull/2778 + # and https://github.com/huggingface/transformers/issues/3788 + # Strip white spaces on the right + if tok_extended.rstrip and i > 0: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + sub_text = sub_text.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and i < len(split_text) - 1: + sub_text = sub_text.rstrip() # Opposite here + + if i == 0 and not sub_text: + result.append(tok) + elif i == len(split_text) - 1: + if sub_text: + result.append(sub_text) + else: + pass + else: + if sub_text: + result.append(sub_text) + result.append(tok) + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self.text_tokenizer.encode(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self._command_token_tokens: + tokenized_text.extend(split_on_token(tok, sub_text)) + else: + tokenized_text.append(sub_text) + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + (self._encode(token) + if token not in self._command_token_tokens else + [self.command_token_map[token].Id] + for token in tokenized_text))) + + no_split_tokens = self._command_tokens + Ids = split_on_tokens(no_split_tokens, processed_text) + tokenization = Tokenization(Ids, processed_text, text) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def _encode(self, text): + raise NotImplementedError + + def EncodeAsTokens(self, text, process_fn=None): + """ + encode text as tokens using text tokenizer + """ + tokenization = self.text_tokenizer.EncodeAsTokens( + text, process_fn=process_fn) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def IdToToken(self, Id, type_token=False): + """convert Id to token accounting for command and type tokens""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id < self.num_command_tokens: + return self.command_id_map[Id].token + return self.text_tokenizer.IdToToken(Id - self.num_command_tokens) + + def TokenToId(self, token, type_token=False): + """convert token to Id accounting for command and type tokens""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + if token in self.command_token_map: + return self.command_token_map[token].Id + return self.text_tokenizer.TokenToId(token) + self.num_command_tokens + + def DecodeIds(self, Ids, type_token=False): + """ + convert Ids to tokens accounting for command and type tokens, tokens + are joined and returned as a string. + """ + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. 
+ type_id_map[Id].token for Id in Ids) + rtn_strs = [] + current_str = [] + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + for Id in Ids: + if isinstance(Id, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(Id.token) + elif Id < self.num_command_tokens: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(self.command_id_map[Id].token) + else: + current_str.append(Id - self.num_command_tokens) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + return ' '.join(rtn_strs) + + def DecodeTokens(self, Tokens, type_token=False): + """ + convert tokens to a string accounting for command and type tokens. + """ + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + rtn_strs = [] + current_str = [] + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + for t in Tokens: + if isinstance(t, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t.token) + elif t in self.command_token_map: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t) + else: + current_str.append(t) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + return ' '.join(rtn_strs) + + +class TextTokenizer(object): + """ + Interface for text tokenizer + """ + + def __init__(self): + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = 0 + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_text_tokens + + def __call__(self, text, process_fn=None): + return self.EncodeAsIds(text, process_fn) + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + raise NotImplementedError( + 'TextTokenizer tokens property not implemented') + + @property + def vocab(self): + """dictionary mapping tokens to ids""" + raise NotImplementedError( + 'TextTokenizer vocab property not implemented') + + @staticmethod + def exists(model_path): + """check if the filepath for a text tokenizer exists""" + raise NotImplementedError( + 'TextTokenizer exists method not implemented') + + def Train(self, corpus): + """train a tokenizer on a data corpus and save model for future use""" + raise NotImplementedError('TextTokenizer Train not implemented') + + def EncodeAsIds(self, text, process_fn=None): + """ + Preprocess text and encode as ids. Return a tokenization object with + original text, processed text, and id tokenization. + """ + raise NotImplementedError('TextTokenizer EncodeAsIds not implemented') + + def EncodeAsTokens(self, text, process_fn=None): + """ + Preprocess text and encode as tokens. Return a tokenization object with + original text, processed text, and token tokenization. + """ + raise NotImplementedError( + 'TextTokenizer EncodeAsTokens not implemented') + + def IdToToken(self, Id): + """Convert an Id to Token. Reverse lookup of self.vocab""" + raise NotImplementedError('TextTokenizer IdToToken not implemented') + + def TokenToId(self, token): + """Convert a Token to Id. 
Lookup of self.vocab""" + raise NotImplementedError('TextTokenizer TokenToId not implemented') + + def DecodeIds(self, Ids): + """Convert a list or tokenization object of Ids to a text string""" + raise NotImplementedError('TextTokenizer DecodeIds not implemented') + + def DecodeTokens(self, Tokens): + """Convert a list or tokenization object of tokens to a text string""" + raise NotImplementedError('TextTokenizer DecodeTokens not implemented') + + +class CharacterLevelTokenizer(TextTokenizer): + """ + Text tokenizer for ASCII-256 Character Level Tokenization. + """ + + def __init__(self, **kwargs): + self.num_text_tokens = 256 + super(CharacterLevelTokenizer, self).__init__() + self._tokens = [ + self.IdToToken(Id) for Id in range(self.num_text_tokens) + ] + self._vocab = {t: i for i, t in enumerate(self._tokens)} + + def __len__(self): + return 256 + + @staticmethod + def exists(model_path): + return True + + def Train(self, corpus): + pass + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + def EncodeAsIds(self, text, process_fn=None): + """convert text to ascii 256 Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [self.TokenToId(c) for c in processed_text] + return Tokenization(tokens, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert text to ascii 256 characters""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [c for c in processed_text] + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id): + """ascii index to character""" + return chr(Id) + + def TokenToId(self, token): + """ascii character to index""" + return ord(token) + + def DecodeIds(self, Ids): + """converts ascii ids to tokens before joining them into text""" + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return ''.join([self.IdToToken(tok) for tok in Ids]) + + def DecodeTokens(self, Tokens): + """just concatenates ascii tokens into text""" + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ''.join(Tokens) + + +MAX_SENTENCEPIECE_SENTENCES = 100000000 + + +def get_corpus_freq(dataset, filepath, filetype='tsv'): + """ + Take corpus, split it into sentences, and extract word frequencies. + Write frequencies to `filepath` as a tsv. Only write the first + MAX_SENTENCEPIECE_SENTENCES most common words to the file. 
+ """ + nltk.download('punkt', download_dir='./nltk') + if filetype == 'tsv': + delimiter = '\t' + else: + delimiter = ',' + + print('compute corpus frequency\n', flush=True) + + total_sentence_count = 0 + maxlen = 0 + freqs = {} + for entry in dataset: + if isinstance(entry, dict): + entry = entry['text'] + lines = entry.strip().split('\n') + for line in lines: + sentences = nltk_tokenize.sent_tokenize(line) + total_sentence_count += len(sentences) + for sentence in sentences: + maxlen = max(len(line), maxlen) + for word in sentence.split(): + if word not in freqs: + freqs[word] = 0 + freqs[word] += 1 + + print('length of freqs before truncating ' + str(len(freqs)), flush=True) + print('file path for freq ' + str(filepath), flush=True) + + freqs_sorted = {} + counter = 0 + for word, count in sorted(freqs.items(), key=lambda x: x[1], reverse=True): + if counter >= MAX_SENTENCEPIECE_SENTENCES: + break + counter += 1 + freqs_sorted[word] = count + + print( + 'length of freqs after trancating ' + str(len(freqs_sorted)), + flush=True) + + with open(filepath, 'w') as f: + writer = csv.writer(f, delimiter=delimiter) + for k, v in freqs_sorted.items(): + writer.writerow([str(k), str(v)]) + + return total_sentence_count, maxlen + + +class SentencePieceTokenizer(TextTokenizer): + """Trains and uses sentencepiece for text tokenization""" + + def __init__(self, + model_type='bpe', + vocab_size=None, + corpus=None, + model_path=None, + character_coverage=1.0, + **kwargs): + self.character_coverage = character_coverage + self.model_type = model_type.lower() + self.spm_model = model_path + self.num_text_tokens = vocab_size + make_train = not SentencePieceTokenizer.exists(self.spm_model) + if make_train: + assert corpus is not None and self.num_text_tokens is not None + self.Train(corpus, self.num_text_tokens) + self._tokens = [] + self._vocab = {} + self.load_spm_model() + super(SentencePieceTokenizer, self).__init__() + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + @staticmethod + def exists(model_path): + if model_path is None: + return False + # check if path exists + dne = not os.path.exists(model_path) + # check if path.model exists + if dne and not model_path.endswith('.model'): + dne = not os.path.exists(model_path + '.model') + return not dne + + def load_spm_model(self): + """load sentencepiece model and parse vocab""" + if not os.path.exists( + self.spm_model) and not self.spm_model.endswith('.model'): + self.spm_model = self.spm_model + '.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(self.spm_model) + self.vocab_size = self.num_text_tokens = len(self.sp) + self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)] + self._vocab = {t: i for i, t in enumerate(self._tokens)} + + def Train(self, corpus, num_text_tokens): + """train sentencepiece model on corpus using word frequencies""" + self.num_text_tokens = num_text_tokens + use_model_path = self.spm_model + random_hash = str(random.randint(0, 2147483647)) + if use_model_path is None: + use_model_path = random_hash + if use_model_path.endswith('.model'): + use_model_path = use_model_path[:use_model_path.rfind('.model')] + input_path = use_model_path + '.tsv.' 
+ random_hash
+        line_count, maxlenline = get_corpus_freq(corpus, input_path)
+        line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES)
+        print(
+            'line count used as input_sentence_size ', line_count, flush=True)
+        print('training sentencepiece model', flush=True)
+        train_string = '--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}' \
+                       + ' --model_type={model_type} --character_coverage={character_coverage} ' \
+                       + '--input_sentence_size={input_sentence_size} ' \
+                       + '--input_format=tsv'
+        train_string = train_string.format(
+            file_path=input_path,
+            model_prefix=use_model_path,
+            vocab_size=num_text_tokens,
+            model_type=self.model_type,
+            character_coverage=self.character_coverage,
+            input_sentence_size=int(line_count))
+        print(
+            'calling spm.SentencePieceTrainer.Train(%s)' % (train_string),
+            flush=True)
+        spm.SentencePieceTrainer.Train(train_string)
+        os.remove(input_path)
+        self.spm_model = use_model_path + '.model'
+        print('sentencepiece model written to ' + self.spm_model, flush=True)
+
+    def EncodeAsIds(self, text, process_fn=None):
+        """convert text to sentencepiece Ids"""
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = self.sp.EncodeAsIds(processed_text)
+        return Tokenization(tokens, processed_text, text)
+
+    def EncodeAsTokens(self, text, process_fn=None):
+        """convert text to sentencepiece tokens"""
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = self.sp.EncodeAsTokens(processed_text)
+        return Tokenization(tokens, processed_text, text, asIds=False)
+
+    def IdToToken(self, Id):
+        """convert Id to sentencepiece token"""
+        return self.sp.IdToPiece(Id)
+
+    def TokenToId(self, token):
+        """convert sentencepiece token to Id"""
+        return self.sp.PieceToId(token)
+
+    def DecodeIds(self, Ids):
+        """converts ids to a text string"""
+        if isinstance(Ids, Tokenization):
+            Ids = Ids.tokenization
+        return self.sp.DecodeIds(Ids)
+
+    def DecodeTokens(self, Tokens):
+        """converts sentencepiece tokens to a text string"""
+        if isinstance(Tokens, Tokenization):
+            Tokens = Tokens.tokenization
+        return self.sp.DecodeTokens(Tokens)
+
+
+class BertWordPieceTokenizer(Tokenizer):
+    """
+    Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization
+    in BERT training. Defaults to the bert-large-uncased tokenizer.
+ """ + + def __init__(self, + tokenizer_model_type=None, + cache_dir=None, + add_block_symbols=False, + add_sentinel_token=0, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + # default to bert-large-uncased tokenizer + if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP: + tokenizer_model_type = 'bert-large-uncased' + if not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0: + print('loading BertWordPieceTokenizer (', tokenizer_model_type, + ') from cache_dir ', cache_dir) + do_lower_case = not ('-cased' in tokenizer_model_type + or 'chinese' in tokenizer_model_type) + self.text_tokenizer = BertTokenizer.from_pretrained( + tokenizer_model_type, + do_lower_case=do_lower_case, + cache_dir=cache_dir) + if not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0: + print('loaded', tokenizer_model_type) + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + + # set command tokens from wordpiece tokenizer values + self.num_command_tokens = 6 + self.num_tokens = len(self.text_tokenizer.vocab) + self.num_text_tokens = self.num_tokens - 5 + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']), + CommandToken('MASK', '[MASK]', + self.text_tokenizer.vocab['[MASK]']), + CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), + CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), + CommandToken('eos', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + self._command_tokens.extend([ + CommandToken('gMASK', '[gMASK]', self.num_tokens), + CommandToken('sMASK', '[sMASK]', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + if add_sentinel_token > 0: + for i in range(1, add_sentinel_token): + self._command_tokens.extend([ + CommandToken(f'MASK{i}', f'[MASK{i}]', self.num_tokens), + CommandToken(f'sop{i}', f'<|startofpiece{i}|>', + self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + # set type tokens + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # parse tokens and vocabs from tokenizer + + self._tokens = list(self.text_tokenizer.vocab.keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.vocab.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = { + k: v + for k, v in self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + 
self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def _encode(self, text): + tokens = self.text_tokenizer.tokenize(text) + ids = self.text_tokenizer.convert_tokens_to_ids(tokens) + return ids + + def EncodeAsTokens(self, text, process_fn=None): + """convert wordpiece token to Id""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, type_token=False): + """convert Id to sentencpiece token""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id in self.command_id_map: + return self.command_id_map[Id].token + return self.text_tokenizer.ids_to_tokens[Id] + + def TokenToId(self, token, type_token=False): + """convert sentencpiece token to Id""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.vocab[token] + + def DecodeIds(self, Ids, type_token=False): + """converts ids to wordpiece tokens and joins them as a text string""" + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + Tokens = [] + for Id in Ids: + if Id in self.command_id_map: + Tokens.append(self.command_id_map[Id].token) + elif Id in self.text_tokenizer.ids_to_tokens: + Tokens.append(self.text_tokenizer.ids_to_tokens[Id]) + new_tokens = [] + for token in Tokens: + if token.startswith('##') and len(new_tokens) > 0: + new_tokens[-1] += token[2:] + else: + new_tokens.append(token) + return ' '.join(new_tokens) + + def DecodeTokens(self, Tokens, type_token=False): + """converts wordpiece tokens to a text string""" + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ' '.join(Tokens) + + +class GPT2BPETokenizer(Tokenizer): + + def __init__(self, + model_type_or_path, + cache_dir=None, + add_block_symbols=False, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + self.text_tokenizer = GPT2Tokenizer.from_pretrained( + model_type_or_path, cache_dir=cache_dir) + + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + self.num_tokens = len(self.text_tokenizer.encoder) + self.num_type_tokens = 2 + if model_type_or_path.startswith('roberta'): + self.num_command_tokens = 6 + self.num_text_tokens = self.num_tokens - 3 + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['']), + CommandToken('sep', '[SEP]', + self.text_tokenizer.encoder['']), + CommandToken('ENC', '[CLS]', + self.text_tokenizer.encoder['']), + CommandToken( + 'MASK', + '[MASK]', + self.text_tokenizer.encoder[''], + lstrip=True), + CommandToken('unk', '[UNK]', + self.text_tokenizer.encoder['']) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + else: + self.num_command_tokens = 2 + self.num_text_tokens = self.num_tokens - 1 + 
self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), + CommandToken('ENC', '[CLS]', self.num_tokens + 2), + CommandToken( + 'MASK', '[MASK]', self.num_tokens + 3, lstrip=True), + CommandToken('sep', '[SEP]', self.num_tokens + 4), + CommandToken('unk', '[UNK]', self.num_tokens + 5) + ]) + self.num_tokens += 6 + self.num_command_tokens += 6 + if add_block_symbols: + if add_task_mask: + self._command_tokens.extend([ + CommandToken( + 'gMASK', '[gMASK]', self.num_tokens, lstrip=True), + CommandToken( + 'sMASK', '[sMASK]', self.num_tokens + 1, lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + self._tokens = list(self.text_tokenizer.encoder.keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.encoder.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = { + k: v + for k, v in self.text_tokenizer.encoder.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + for idx, tok in self.command_id_map.items(): + self.text_tokenizer.decoder[idx] = tok.token + + def EncodeAsIds(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. 
https://github.com/huggingface/transformers/pull/2778
+                # and https://github.com/huggingface/transformers/issues/3788
+                # Strip white spaces on the right
+                if tok_extended.rstrip and i > 0:
+                    # A bit counter-intuitive but we strip the left of the string
+                    # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                    sub_text = sub_text.lstrip()
+                # Strip white spaces on the left
+                if tok_extended.lstrip and i < len(split_text) - 1:
+                    sub_text = sub_text.rstrip()  # Opposite here
+
+                if i == 0 and not sub_text:
+                    result.append(tok)
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result.append(sub_text)
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result.append(sub_text)
+                    result.append(tok)
+            return result
+
+        def split_on_tokens(tok_list, text):
+            if not text.strip():
+                return []
+            if not tok_list:
+                return self.text_tokenizer.encode(text)
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self._command_token_tokens:
+                        tokenized_text.extend(split_on_token(tok, sub_text))
+                    else:
+                        tokenized_text.append(sub_text)
+                text_list = tokenized_text
+
+            return list(
+                itertools.chain.from_iterable(
+                    (self.text_tokenizer.encode(token)
+                     if token not in self._command_token_tokens else
+                     [self.command_token_map[token].Id]
+                     for token in tokenized_text)))
+
+        no_split_tokens = self._command_tokens
+        Ids = split_on_tokens(no_split_tokens, processed_text)
+        tokenization = Tokenization(Ids, processed_text, text)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def _encode(self, text):
+        return self.text_tokenizer.encode(text)
+
+    def EncodeAsTokens(self, text, process_fn=None):
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = []
+        for token in re.findall(self.text_tokenizer.pat, processed_text):
+            token = ''.join(self.text_tokenizer.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            tokens.extend(
+                bpe_token
+                for bpe_token in self.text_tokenizer.bpe(token).split(' '))
+        tokenization = Tokenization(tokens, processed_text, text, asIds=False)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def DecodeAsTokens(self, Ids):
+        return [self.IdToToken(x) for x in Ids]
+
+    def IdToToken(self, Id, type_token=False):
+        if isinstance(Id, (TypeToken, CommandToken)):
+            return Id.token
+        if type_token:
+            return self.type_id_map[Id].token
+        if Id in self.command_id_map:
+            return self.command_id_map[Id].token
+        return self.text_tokenizer.decoder[Id]
+
+    def TokenToId(self, token, type_token=False):
+        if isinstance(token, (TypeToken, CommandToken)):
+            return token.Id
+        if type_token:
+            return self.type_token_map[token].Id
+        return self.text_tokenizer.encoder[token]
+
+    def DecodeIds(self, Ids, type_token=False):
+        if type_token:
+            return ' '.join(
+                Id.token if isinstance(Id, TypeToken) else self.
+ type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return self.text_tokenizer.decode(Ids) + + def DecodeTokens(self, Tokens, type_token=False): + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.text_tokenizer.decode( + [self.TokenToId(tok) for tok in Tokens]) + + +class ChineseSPTokenizer(Tokenizer): + + def __init__(self, + model_path, + add_block_symbols=False, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + self.text_tokenizer = sp_tokenizer.from_pretrained(model_path) + + self.num_command_tokens = 0 + self.num_text_tokens = self.text_tokenizer.sp.vocab_size() + self.num_tokens = self.num_text_tokens + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', self.num_text_tokens), + CommandToken('eos', '<|endoftext|>', self.num_text_tokens), + CommandToken('sep', '[SEP]', self.num_text_tokens + 1), + CommandToken('ENC', '[CLS]', self.num_text_tokens + 2), + CommandToken( + 'MASK', '[MASK]', self.num_text_tokens + 3, lstrip=True), + CommandToken('unk', '[UNK]', self.num_text_tokens + 4) + ] + self.num_tokens += 5 + self.num_command_tokens += 6 + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens + 1), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + self._command_tokens.extend([ + CommandToken( + 'gMASK', '[gMASK]', self.num_tokens, lstrip=True), + CommandToken( + 'sMASK', '[sMASK]', self.num_tokens + 1, lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # self._tokens = list(self.text_tokenizer.encoder.keys()) + # self._vocab = {k:v for k,v in self.text_tokenizer.encoder.items()} + # + # self._text_tokens = list(self._tokens) + # self._text_token_vocab = {k:v for k,v in self.text_tokenizer.encoder.items()} + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def _encode(self, text): + ids = self.text_tokenizer.encode(text) + return ids + + def EncodeAsTokens(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + tokenization = Tokenization(tokens, processed_text, text, asIds=False) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + # return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, 
type_token=False): + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id in self.command_id_map: + return self.command_id_map[Id].token + elif Id in self.type_id_map: + return self.type_id_map[Id].token + else: + return self.text_tokenizer.convert_id_to_token(int(Id)) + + def TokenToId(self, token, type_token=False): + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.convert_token_to_id(token) + + def DecodeIds(self, Ids, type_token=False): + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + Ids = list(map(int, Ids)) + pieces = [] + last = 0 + for i, token_id in enumerate(Ids): + if token_id in self.command_id_map: + pieces.append(Ids[last:i]) + pieces.append(token_id) + last = i + 1 + pieces.append(Ids[last:]) + text = '' + for piece in pieces: + if isinstance(piece, int): + text += self.command_id_map[piece].token + elif piece: + text += self.text_tokenizer.decode(piece) + return text + + def DecodeTokens(self, Tokens, type_token=False): + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.text_tokenizer.decode( + [self.TokenToId(tok) for tok in Tokens]) diff --git a/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py b/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py new file mode 100644 index 00000000..d179e055 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py @@ -0,0 +1,359 @@ +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import os +import sys +from io import open + +import json +import regex as re + +from .file_utils import cached_path + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
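+    # With this fallback the decorator is a no-op, so bytes_to_unicode()
+    # below is simply recomputed on every call instead of being cached.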
+ def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': '.pytorch_pretrained_bert/gpt2-vocab.json', + 'roberta': '.pytorch_pretrained_bert/roberta-vocab.json' +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': '.pytorch_pretrained_bert/gpt2-merges.txt', + 'roberta': '.pytorch_pretrained_bert/roberta-merges.txt' +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + + @classmethod + def from_pretrained(cls, + pretrained_model_name_or_path, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ + pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[ + pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, + VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, + MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, + SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info('loading special tokens file {}'.format( + special_tokens_file)) + # redirect to the cache, if necessary + # try: + # resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + # resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + # except EnvironmentError: + # logger.error( + # "Model name '{}' was not found in model name list ({}). 
" + # "We assumed '{}' was a path or url but couldn't find files {} and {} " + # "at this path or url.".format( + # pretrained_model_name_or_path, + # ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + # pretrained_model_name_or_path, + # vocab_file, merges_file)) + # return None + # if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + # logger.info("loading vocabulary file {}".format(vocab_file)) + # logger.info("loading merges file {}".format(merges_file)) + # else: + # logger.info("loading vocabulary file {} from cache at {}".format( + # vocab_file, resolved_vocab_file)) + # logger.info("loading merges file {} from cache at {}".format( + # merges_file, resolved_merges_file)) + resolved_vocab_file = vocab_file + resolved_merges_file = merges_file + logger.info('loading vocabulary file {}'.format(vocab_file)) + logger.info('loading merges file {}'.format(merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open( + special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls( + resolved_vocab_file, + resolved_merges_file, + special_tokens=special_tokens, + *inputs, + **kwargs) + return tokenizer + + def __init__(self, + vocab_file, + merges_file, + errors='replace', + special_tokens=None, + max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" + ) + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = { + v: k + for k, v in self.special_tokens.items() + } + logger.info('Special tokens {}'.format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend( + bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 + and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this OpenAI GPT model ({} > {}). Running this' + ' sequence through the model will result in indexing errors'. 
+ format(len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error('Vocabulary path ({}) should be a directory'.format( + vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, 'w', encoding='utf-8') as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted( + self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + 'Saving vocabulary to {}: BPE merge indices are not consecutive.' + ' Please check that the tokenizer is not corrupted!'. + format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted( + self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + 'Saving special tokens vocabulary to {}: BPE indices are not consecutive.' + ' Please check that the tokenizer is not corrupted!'. + format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/modelscope/models/nlp/mglm/data_utils/wordpiece.py b/modelscope/models/nlp/mglm/data_utils/wordpiece.py new file mode 100755 index 00000000..1cecffbd --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/wordpiece.py @@ -0,0 +1,408 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes. 
Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py""" # noqa + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import collections +import logging +import os +import unicodedata +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'bert-base-uncased': + '.pytorch_pretrained_bert/bert-base-uncased-vocab.txt', + 'bert-large-uncased': + '.pytorch_pretrained_bert/bert-large-uncased-vocab.txt', + 'bert-base-cased': + '.pytorch_pretrained_bert/bert-base-cased-vocab.txt', + 'bert-large-cased': + '.pytorch_pretrained_bert/bert-large-cased-vocab.txt', + 'bert-base-multilingual-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt', + 'bert-base-multilingual-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt', + 'bert-base-chinese': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt', +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, +} +VOCAB_NAME = 'vocab.txt' + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, 'r', encoding='utf-8') as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, + vocab_file, + do_lower_case=True, + max_len=None, + do_basic_tokenize=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + 'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + .format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + if self.do_basic_tokenize: + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this BERT model ({} > {}). Running this' + ' sequence through BERT will result in indexing errors'.format( + len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + @classmethod + def from_pretrained(cls, + pretrained_model_name_or_path, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ + pretrained_model_name_or_path] + else: + vocab_file = pretrained_model_name_or_path + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + 'associated to this path or url.'.format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info('loading vocabulary file {}'.format(vocab_file)) + else: + logger.info('loading vocabulary file {} from cache at {}'.format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BasicTokenizer. 
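+
+        A short illustration of the expected behaviour with the defaults
+        (result shown as a comment)::
+
+            BasicTokenizer().tokenize("Hello, World!")
+            # ['hello', ',', 'world', '!']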
+ + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
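+        # For example, U+4E2D ('中') falls inside the base CJK block checked
+        # below, while Hiragana U+3042 ('あ') does not and is therefore not
+        # padded with spaces by _tokenize_chinese_chars.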
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # noqa + (cp >= 0x3400 and cp <= 0x4DBF) or # noqa + (cp >= 0x20000 and cp <= 0x2A6DF) or # noqa + (cp >= 0x2A700 and cp <= 0x2B73F) or # noqa + (cp >= 0x2B740 and cp <= 0x2B81F) or # noqa + (cp >= 0x2B820 and cp <= 0x2CEAF) or # noqa + (cp >= 0xF900 and cp <= 0xFAFF) or # noqa + (cp >= 0x2F800 and cp <= 0x2FA1F)): # noqa + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat.startswith('C'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
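+    # For example, '$' (cp 36) and '^' (cp 94) fall inside the ASCII ranges
+    # checked below, so they count as punctuation here even though their
+    # Unicode category is a symbol class rather than "P*".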
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) + or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False diff --git a/modelscope/models/nlp/mglm/fp16/__init__.py b/modelscope/models/nlp/mglm/fp16/__init__.py new file mode 100644 index 00000000..90d20bcf --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fp16 import * # noqa +from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm, + convert_module, convert_network, + master_params_to_model_params, + model_grads_to_master_grads, network_to_half, + prep_param_lists, to_python_float, tofp16) +from .loss_scaler import * # noqa diff --git a/modelscope/models/nlp/mglm/fp16/fp16.py b/modelscope/models/nlp/mglm/fp16/fp16.py new file mode 100755 index 00000000..10fbd804 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/fp16.py @@ -0,0 +1,660 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from .fp16util import (clip_grad_norm, master_params_to_model_params, + model_grads_to_master_grads) +from .loss_scaler import DynamicLossScaler, LossScaler + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + + +def conversion_helper(val, conversion): + """Apply conversion to val. 
Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + + return conversion_helper(val, half_conversion) + + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class FP16_Module(nn.Module): + + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + +# TODO: Update overflow check + downscale to use Carl's fused kernel. +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. 
+ dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. This means that adjusting the loss scale, or using + dynamic loss scaling, should not require retuning the learning rate or any other + hyperparameters. + + + **Advanced options** + + **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. + See docstring for :attr:`step`. + + **Gradient clipping**: Use :attr:`clip_master_grads`. + + **Multiple losses**: If your model accumulates gradients from multiple losses, + this can be made more efficient by supplying ``update_master_grads=False`` + to :attr:`backward`. See docstring for :attr:`backward`. + + **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: + + print(optimizer.loss_scale) + optimizer.loss_scale = new_loss_scale + + For static loss scaling, manually adjusting the loss scale over time is a reasonable + thing to do. During later epochs, gradients may become smaller, and a + higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss + scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting + the loss scale is not recommended. + + **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in + Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` + should still work as intended. 
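+
+    Putting the pieces above together, a minimal sketch (``model``, ``loss_fn``,
+    ``inputs`` and ``targets`` are placeholders, and ``model`` is assumed to
+    already be in half precision on the GPU, e.g. wrapped in :class:`FP16_Module`)::
+
+        optimizer = FP16_Optimizer(
+            torch.optim.SGD(model.parameters(), lr=1e-3), dynamic_loss_scale=True)
+
+        optimizer.zero_grad()
+        optimizer.backward(loss_fn(model(inputs), targets))
+        optimizer.clip_master_grads(1.0)  # clips the fp32 master grads
+        optimizer.step()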
+ """ # noqa + + def __init__(self, + init_optimizer, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=False): + if not torch.cuda.is_available: + raise SystemError('Cannot use fp16 without CUDA.') + + self.verbose = verbose + + self.optimizer = init_optimizer + # init_state_dict sets up an alternative way to cast per-param state tensors. + # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. + # init_state_dict = init_optimizer.state_dict() + + self.fp16_groups = [] + self.fp32_from_fp16_groups = [] + self.fp32_from_fp32_groups = [] + for i, param_group in enumerate(self.optimizer.param_groups): + self.maybe_print( + 'FP16_Optimizer processing param group {}:'.format(i)) + fp16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_fp16_params_this_group = [] + for i, param in enumerate(param_group['params']): + if param.requires_grad: + if param.type() == 'torch.cuda.HalfTensor': + self.maybe_print( + 'FP16_Optimizer received torch.cuda.HalfTensor with {}' + .format(param.size())) + fp16_params_this_group.append(param) + master_param = param.detach().clone().float() + master_param.requires_grad = True + # Copythe model parallel flag. + master_param.model_parallel = param.model_parallel + param_group['params'][i] = master_param + fp32_from_fp16_params_this_group.append(master_param) + # Reset existing state dict key to the new master param. + # We still need to recast per-param state tensors, if any, to FP32. + if param in self.optimizer.state: + self.optimizer.state[ + master_param] = self.optimizer.state.pop(param) + elif param.type() == 'torch.cuda.FloatTensor': + self.maybe_print( + 'FP16_Optimizer received torch.cuda.FloatTensor with {}' + .format(param.size())) + fp32_params_this_group.append(param) + param_group['params'][i] = param + else: + raise TypeError( + 'Wrapped parameters must be either ' + 'torch.cuda.FloatTensor or torch.cuda.HalfTensor. ' + 'Received {}'.format(param.type())) + + self.fp16_groups.append(fp16_params_this_group) + self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors + self.optimizer.load_state_dict(self.optimizer.state_dict()) + # alternative way to cast per-param state tensors: + # self.optimizer.load_state_dict(init_state_dict) + + if dynamic_loss_scale: + self.dynamic_loss_scale = True + if dynamic_loss_args is not None: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + else: + self.loss_scaler = DynamicLossScaler() + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(static_loss_scale) + + self.overflow = False + self.first_closure_call_this_step = True + + self.clip_grad_norm = clip_grad_norm + + def maybe_print(self, msg): + if self.verbose: + print(msg) + + def __getstate__(self): + raise RuntimeError( + 'FP16_Optimizer should be serialized using state_dict().') + + def __setstate__(self, state): + raise RuntimeError( + 'FP16_Optimizer should be deserialized using load_state_dict().') + + def zero_grad(self, set_grads_to_None=False): + """ + Zero fp32 and fp16 parameter grads. + """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. 
However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_( + ) # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1. / self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. + + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). + """ # noqa + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict[ + 'first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict[ + 'first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + for current_group, saved_group in zip(self.fp32_from_fp16_groups, + state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
+ However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ # noqa + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print( + 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. + # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while (self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print( + 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' + 'reducing to {}'.format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. + optimizer.backward(loss1) + optimizer.backward(loss2) + + # More efficient way to handle multiple losses: + # The fp16->fp32 grad copy is delayed until fp16 grads from all + # losses have been accumulated. + optimizer.backward(loss1, update_master_grads=False) + optimizer.backward(loss2, update_master_grads=False) + optimizer.update_master_grads() + """ # noqa + # To consider: try multiple backward passes using retain_grad=True to find + # a loss scale that works. After you find a loss scale that works, do a final dummy + # backward pass with retain_graph=False to tear down the graph. Doing this would avoid + # discarding the iteration, but probably wouldn't improve overall efficiency. + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + if update_master_grads: + self.update_master_grads() + + def update_master_grads(self): + """ + Copy the ``.grad`` attribute from stored references to fp16 parameters to + the ``.grad`` attribute of the fp32 master parameters that are directly + updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if + ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. + """ # noqa + if self.dynamic_loss_scale: + self._check_overflow() + if self.overflow: return # noqa + self._model_grads_to_master_grads() + self._downscale_master() + + def inspect_master_grad_data(self): + """ + When running with :class:`FP16_Optimizer`, + ``.grad`` attributes of a model's fp16 leaves should not be + regarded as truthful, because they might be scaled. + After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, + the fp32 master params' ``.grad`` + attributes will contain valid gradients properly divided by the loss scale. However, + because :class:`FP16_Optimizer` flattens some parameters, accessing them may be + nonintuitive. :attr:`inspect_master_grad_data` + allows those gradients to be viewed with shapes corresponding to their associated model leaves. + + Returns: + List of lists (one list for each parameter group). The list for each parameter group + is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. + """ + if self.overflow: + print( + 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' + 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' + ) + return None + else: + # The optimizer owns only references to master params. + master_grads_data = [] + for param_group in self.optimizer.param_groups: + master_grads_this_group = [] + for param in param_group['params']: + if param.grad is not None: + master_grads_this_group.append(param.grad.data) + else: + master_grads_this_group.append(None) + master_grads_data.append(master_grads_this_group) + return master_grads_data + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/models/nlp/mglm/fp16/fp16util.py b/modelscope/models/nlp/mglm/fp16/fp16util.py new file mode 100644 index 00000000..3fcd3005 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/fp16util.py @@ -0,0 +1,220 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
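+
+# The utilities below convert a network to half precision while keeping
+# batchnorm layers in fp32, and maintain fp32 "master" copies of the model
+# parameters. A minimal usage sketch (``MyNet`` is a placeholder module that
+# is not defined in this file):
+#
+#     model = network_to_half(MyNet().cuda())
+#     model_params, master_params = prep_param_lists(model)
+#     # ... forward / backward ...
+#     model_grads_to_master_grads(model_params, master_params)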
+ +import torch +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable + +from modelscope.models.nlp.mglm import mpu + + +class tofp16(nn.Module): + """ + Utility module that implements:: + + def forward(self, input): + return input.half() + """ + + def __init__(self): + super(tofp16, self).__init__() + + def forward(self, input): + return input.half() + + +def BN_convert_float(module): + """ + Utility function for network_to_half(). + + Retained for legacy purposes. + """ + if isinstance( + module, + torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + module.float() + for child in module.children(): + BN_convert_float(child) + return module + + +def network_to_half(network): + """ + Convert model to half precision in a batchnorm-safe way. + + Retained for legacy purposes. It is recommended to use FP16Model. + """ + return nn.Sequential(tofp16(), BN_convert_float(network.half())) + + +def convert_module(module, dtype): + """ + Converts a module's immediate parameters and buffers to dtype. + """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm + ) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError( + 'master_params recieved a gradient in the backward pass!') + + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. + + .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ # noqa + model_params = [ + param for param in model.parameters() if param.requires_grad + ] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors( + [param.data for param in model_params]).float() + except: # noqa + print( + 'Error in prep_param_lists: model may contain a mixture of parameters ' + 'of different types. Use flat_master=False, or use F16_Optimizer.' + ) + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [ + param.clone().float().detach() for param in model_params + ] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, + master_params, + flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. + """ # noqa + if flat_master: + # The flattening may incur one more deep copy than is necessary. + master_params[0].grad.data.copy_( + _flatten_dense_tensors([p.grad.data for p in model_params])) + else: + for model, master in zip(model_params, master_params): + if model.grad is not None: + if master.grad is None: + master.grad = Variable( + master.data.new(*master.data.size())) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master_params_to_model_params(model_params, + master_params, + flat_master=False): + """ + Copy master parameters to model parameters. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. + """ # noqa + if flat_master: + for model, master in zip( + model_params, + _unflatten_dense_tensors(master_params[0].data, model_params)): + model.data.copy_(master) + else: + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + + +# Backward compatibility fixes + + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) + +clip_grad_norm = mpu.clip_grad_norm diff --git a/modelscope/models/nlp/mglm/fp16/loss_scaler.py b/modelscope/models/nlp/mglm/fp16/loss_scaler.py new file mode 100755 index 00000000..721571b3 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/loss_scaler.py @@ -0,0 +1,245 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from modelscope.models.nlp.mglm import mpu + + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. + """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
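+
+    Example of overriding the defaults through :class:`FP16_Optimizer` (the
+    concrete values here are illustrative only)::
+
+        optimizer = FP16_Optimizer(optimizer,
+                                   dynamic_loss_scale=True,
+                                   dynamic_loss_args={'init_scale': 2**16,
+                                                      'scale_window': 500})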
+ """ # noqa + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan( + p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce( + overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float( + 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale / self.scale_factor, + self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter + - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. 
+if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. + N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + +""" diff --git a/modelscope/models/nlp/mglm/generation_utils.py b/modelscope/models/nlp/mglm/generation_utils.py new file mode 100644 index 00000000..6db75b2d --- /dev/null +++ b/modelscope/models/nlp/mglm/generation_utils.py @@ -0,0 +1,483 @@ +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import UserDict +from typing import Iterable, List, Optional, Tuple + +import torch + +PROCESS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses. 
+ next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`UserDict`: A dictionary composed of the fields as defined above: + + - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated + scores of all non-finished beams. + - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens + to be added to the non-finished beam_hypotheses. + - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices + indicating to which beam the next tokens shall be added. + +""" + +FINALIZE_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The final scores of all non-finished beams. + final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The last tokens to be added to the non-finished beam_hypotheses. + final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated + sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all + batches finished early due to the :obj:`eos_token_id`. + +""" + + +class BeamScorer(ABC): + """ + Abstract base class for all beam scorers that are used for :meth:`~transformers.PretrainedModel.beam_search` and + :meth:`~transformers.PretrainedModel.beam_sample`. + """ + + @abstractmethod + def process(self, input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, + **kwargs) -> Tuple[torch.Tensor]: + raise NotImplementedError('This is an abstract method.') + + @abstractmethod + def finalize(self, input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, **kwargs) -> torch.LongTensor: + raise NotImplementedError('This is an abstract method.') + + +class BeamSearchScorer(BeamScorer): + r""" + :class:`transformers.BeamScorer` implementing standard beam search decoding. + + Adapted in part from `Facebook's XLM beam search code + `__. + + Args: + batch_size (:obj:`int`): + Batch Size of :obj:`input_ids` for which beam search decoding is run in parallel. + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + num_beams (:obj:`int`): + Number of beams for beam search. 
+ device (:obj:`torch.device`): + Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of + :obj:`BeamSearchScorer` will be allocated. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): + The number of beam hypotheses that shall be returned upon calling + :meth:`~transformer.BeamSearchScorer.finalize`. + """ + + def __init__( + self, + batch_size: int, + max_length: int, + num_beams: int, + device: torch.device, + length_penalty: Optional[float] = 1.0, + do_early_stopping: Optional[bool] = False, + num_beam_hyps_to_keep: Optional[int] = 1, + ): + self.max_length = max_length + self.num_beams = num_beams + self.device = device + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + + self._is_init = False + self._beam_hyps = [ + BeamHypotheses( + num_beams=self.num_beams, + max_length=self.max_length, + length_penalty=self.length_penalty, + early_stopping=self.do_early_stopping, + ) for _ in range(batch_size) + ] + self._done = torch.tensor([False for _ in range(batch_size)], + dtype=torch.bool, + device=self.device) + + # if not isinstance(num_beams, int) or num_beams <= 1: + # raise ValueError( + # ) + + @property + def is_done(self) -> bool: + return self._done.all() + + def process(self, + input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, + next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + mems=None) -> Tuple[torch.Tensor]: + cur_len = input_ids.shape[-1] + batch_size = len(self._beam_hyps) + assert batch_size == (input_ids.shape[0] // self.num_beams) + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + device = next_scores.device + next_beam_scores = torch.zeros((batch_size, self.num_beams), + dtype=next_scores.dtype, + device=device) + next_beam_tokens = torch.zeros((batch_size, self.num_beams), + dtype=next_tokens.dtype, + device=device) + next_beam_indices = torch.zeros((batch_size, self.num_beams), + dtype=next_indices.dtype, + device=device) + + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + assert ( + len(beam_hyp) >= self.num_beams + ), 'Batch can only be done if at least {} beams have been generated'.format( + self.num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), 'generated beams >= num_beams -> eos_token_id and pad_token have to be defined' + # pad the batch + next_beam_scores[batch_idx, :] = 0 + next_beam_tokens[batch_idx, :] = pad_token_id + next_beam_indices[batch_idx, :] = 0 + continue + + # next tokens for this sentence + beam_idx = 0 + for beam_token_rank, (next_token, next_score, + next_index) in enumerate( + zip(next_tokens[batch_idx], + next_scores[batch_idx], + next_indices[batch_idx])): + batch_beam_idx = batch_idx * self.num_beams + next_index + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (next_token.item() + in eos_token_id): + # if beam_token does 
not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add( + input_ids[batch_beam_idx].clone(), + next_score.item(), + mems=[mem[[next_index.item()]] + for mem in mems] if mems else None) + else: + # add next predicted token since it is not eos_token + next_beam_scores[batch_idx, beam_idx] = next_score + next_beam_tokens[batch_idx, beam_idx] = next_token + next_beam_indices[batch_idx, beam_idx] = batch_beam_idx + beam_idx += 1 + + # once the beam for next step is full, don't add more tokens to it. + if beam_idx == self.num_beams: + break + + if beam_idx < self.num_beams: + raise ValueError( + f'At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected.' # noqa + ) # noqa + + # Check if we are done so that we can save a pad step if all(done) + self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( + next_scores[batch_idx].max().item(), cur_len) + + return UserDict({ + 'next_beam_scores': next_beam_scores.view(-1), + 'next_beam_tokens': next_beam_tokens.view(-1), + 'next_beam_indices': next_beam_indices.view(-1), + }) + + def finalize(self, + input_ids: torch.LongTensor, + final_beam_scores: torch.FloatTensor, + final_beam_tokens: torch.LongTensor, + final_beam_indices: torch.LongTensor, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + mems=None) -> Tuple[torch.LongTensor, List[torch.Tensor]]: + batch_size = len(self._beam_hyps) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + continue + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(self.num_beams): + batch_beam_idx = batch_idx * self.num_beams + beam_id + final_score = final_beam_scores[batch_beam_idx].item() + final_tokens = input_ids[batch_beam_idx] + beam_hyp.add( + final_tokens, + final_score, + mems=[mem[[batch_beam_idx]] + for mem in mems] if mems else None) + + # select the best hypotheses + sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) + best = [] + + # retrieve best hypotheses + for i, beam_hyp in enumerate(self._beam_hyps): + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) + for j in range(self.num_beam_hyps_to_keep): + best_hyp, mems = sorted_hyps.pop()[1:] + sent_lengths[self.num_beam_hyps_to_keep * i + + j] = len(best_hyp) + best.append((best_hyp, mems)) + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item(), self.max_length) + decoded: torch.LongTensor = input_ids.new( + batch_size * self.num_beam_hyps_to_keep, sent_max_len) + # shorter batches are padded if needed + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, '`pad_token_id` has to be defined' + decoded.fill_(pad_token_id) + + # fill with hypotheses and eos_token_id if the latter fits in + mems = [] + for i, (hypo, mem) in enumerate(best): + decoded[i, :sent_lengths[i]] = hypo + if sent_lengths[i] < sent_max_len: + decoded[i, sent_lengths[i]] = eos_token_id + mems.append(mem) + mems = [ + torch.cat([mem[i] for mem in mems], dim=0) + for i in range(len(mems[0])) + ] if mems and mems[0] else None + return decoded, mems + + +class BeamHypotheses: + + def __init__(self, num_beams: int, max_length: int, length_penalty: float, + early_stopping: bool): 
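+        # Hypotheses are ranked by a length-penalized log probability,
+        #   score = sum_logprobs / max(len(hyp), 1) ** length_penalty,
+        # as computed in add() below; worst_score tracks the current cut-off.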
+ """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp: torch.LongTensor, sum_logprobs: float, mems=None): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / (max(hyp.shape[-1], 1)**self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp, mems)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([ + (s, idx) for idx, (s, _, _) in enumerate(self.beams) + ]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len**self.length_penalty + ret = self.worst_score >= cur_score + return ret + + +class LogitsProcessor(ABC): + """Abstract base class for all logit processors that can be applied during generation.""" + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + """Torch method for processing logits.""" + raise NotImplementedError( + f'{self.__class__} is an abstract class. Only classes inheriting this class can be called.' + ) + + +class LogitsProcessorList(list): + """ + This class can be used to create a list of :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from + list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsProcessor` to the inputs. + """ + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (:obj:`int`): + The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. 
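+
+    A minimal usage sketch (the ids below are illustrative)::
+
+        processor = MinLengthLogitsProcessor(min_length=10, eos_token_id=2)
+        scores = processor(input_ids, scores)  # EOS logit set to -inf while cur_len < 10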
+ """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError( + f'`min_length` has to be a positive integer, but is {min_length}' + ) + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError( + f'`eos_token_id` has to be a positive integer, but is {eos_token_id}' + ) + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + scores[:, self.eos_token_id] = -float('inf') + return scores + + +class NoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq + `__. + + Args: + ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError( + f'`ngram_size` has to be a strictly positive integer, but is {ngram_size}' + ) + self.ngram_size = ngram_size + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = self._calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float('inf') + + return scores + + def _calc_banned_ngram_tokens(self, prev_input_ids: torch.Tensor, + num_hypos: int, + cur_len: int) -> List[Iterable[int]]: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < self.ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] + for i in range(self.ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get( + prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - self.ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, + start_idx:cur_len].tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [ + _get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos) + ] + return banned_tokens diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py new file mode 100644 index 00000000..ea1dfb5a --- /dev/null +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -0,0 +1,469 @@ +# Copyright (c) 2022 Zhipu.AI + +import os +import random +from os import path as osp +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from . 
import mpu +from .arguments import get_args +from .generation_utils import BeamSearchScorer +from .train_utils import get_model +from .utils import load_checkpoint + +__all__ = ['MGLMForTextSummarization'] + + +def setup_args(args): + args.block_lm = True + args.task_mask = True + args.cloze_eval = True + args.num_layers = 24 + args.hidden_size = 1536 + args.num_attention_heads = 16 + args.max_position_embeddings = 1024 + args.tokenizer_type = 'ChineseSPTokenizer' + args.load_pretrained = '' + args.DDP_impl = 'none' + args.model_parallel_size = 1 + args.fp16 = True + args.cache_dir = 'cache' + args.out_seq_length = 200 + args.seq_length = 512 + args.temperature = 0.9 + args.top_k = 2 + args.top_p = 0.8 + args.frequency_penalty = 0.1 + args.presence_penalty = 0.1 + args.mem_length = args.seq_length + args.mem_length - 1 + return args + + +def setup_model(args): + """Setup model and optimizer.""" + + model = get_model(args, model_type='generation') + + if args.load_pretrained is not None: + args.no_load_optim = True + args.load = args.load_pretrained + _ = load_checkpoint(model, None, None, args) + + return model + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def get_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + loss_mask=None, + attention_mask=None, + set_loss_mask=False, + mem_length=None): + # Extract batch size and sequence length. + batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if mem_length: + if attention_mask is None: + attention_mask = torch.ones( + (1, seq_length, seq_length + mem_length), device=data.device) + attention_mask = torch.tril( + torch.triu(attention_mask, 1 - seq_length + mem_length), + mem_length) + else: + if reset_attention_mask: + att_mask_batch = batch_size + else: + att_mask_batch = 1 + if attention_mask is None: + attention_mask = torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device) + attention_mask = torch.tril(attention_mask) + attention_mask = attention_mask.unsqueeze(1) + + # Loss mask. + if loss_mask is None: + loss_mask = torch.ones( + data.size(), dtype=torch.float, device=data.device) + + # Position ids. + position_ids = torch.arange( + seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + if set_loss_mask: + loss_mask[data == eod_token] = 0.0 + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + return attention_mask, loss_mask, position_ids + + +def initialize_distributed(args): + """Initialize torch.distributed.""" + + # Manually set the device ids. 
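+    # With the defaults read below (MASTER_ADDR=localhost, MASTER_PORT=6000),
+    # init_method resolves to 'tcp://localhost:6000'; override both environment
+    # variables for multi-node runs.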
+ device = args.rank % torch.cuda.device_count() + if args.local_rank is not None: + device = args.local_rank + torch.cuda.set_device(device) + # Call the init process + init_method = 'tcp://' + args.master_ip = os.getenv('MASTER_ADDR', 'localhost') + args.master_port = os.getenv('MASTER_PORT', '6000') + init_method += args.master_ip + ':' + args.master_port + torch.distributed.init_process_group( + backend=args.distributed_backend, + world_size=args.world_size, + rank=args.rank, + init_method=init_method) + + # Set the model-parallel / data-parallel communicators. + mpu.initialize_model_parallel(args.model_parallel_size) + + # Optional DeepSpeed Activation Checkpointing Features + # + if hasattr( + args, 'deepspeed' + ) and args.deepspeed and args.deepspeed_activation_checkpointing: + set_deepspeed_activation_checkpointing(args) + + +def get_batch(context_tokens, device, args): + tokens = context_tokens + tokens = tokens.view(args.batch_size, -1).contiguous() + tokens = tokens.to(device) + + # Get the masks and postition ids. + if args.block_lm: + attention_mask = torch.tensor([tokens.size(1)], + device=device, + dtype=torch.long) + position_ids = torch.arange( + tokens.size(1), device=device, dtype=torch.long) + if not args.no_block_position: + block_position_ids = torch.zeros( + tokens.size(1), device=device, dtype=torch.long) + position_ids = torch.stack((position_ids, block_position_ids), + dim=0) + position_ids = position_ids.unsqueeze(0) + else: + attention_mask, loss_mask, position_ids = get_masks_and_position_ids( + tokens, + args.eod_token, + reset_position_ids=False, + reset_attention_mask=False, + set_loss_mask=False, + mem_length=args.mem_length) + + return tokens, attention_mask, position_ids + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + # This function has been mostly taken from huggingface conversational ai code at + # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 + + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # convert to 1D + logits = logits.view(logits.size()[1]).contiguous() + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + # going back to 2D + logits = logits.view(1, -1).contiguous() + + return logits + + +def sample_sequence(model, + tokenizer, + context_tokens, + context_length, + args, + device, + mems=None, + end_tokens=None): + if not args.block_lm: + context_tokens, attention_mask, position_ids = get_batch( + context_tokens, device, args) + tokens = torch.empty((args.num_beams, 0), + device=context_tokens.device, + dtype=torch.long) + else: + tokens = context_tokens.new_full((1, 1), + tokenizer.get_command('sop').Id) + counter = 0 + if mems is None: + mems = [] + if end_tokens is None: + end_tokens = [args.eod_token] + + last_beam_num = 1 + 
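+    # Sampling loop: each step temperature-scales the logits, applies
+    # frequency/presence penalties to tokens generated so far, filters with
+    # top-k/top-p, then samples one next token until an end token or
+    # out_seq_length is reached.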
output_tokens_list = [] + generated_tokens_list = [] + + while counter < args.out_seq_length: + if counter == 0 and not args.block_lm: + next_token_logits, *mems = model(context_tokens, position_ids, + attention_mask, *mems) + else: + if args.block_lm: + if args.no_block_position: + position_ids = context_tokens.new_full( + (last_beam_num, 1), context_length + counter) + else: + position_ids = context_tokens.new_ones(last_beam_num, 2, 1) + position_ids[:, 0] = context_length + position_ids[:, 1] = counter + 1 + attention_mask = context_tokens.new_zeros( + [1], device=context_tokens.device, dtype=torch.long) + else: + position_ids = context_tokens.new_ones((last_beam_num, 1)) * ( + context_length + counter - 1) + attention_mask = context_tokens.new_ones( + last_beam_num, + 1, + 1, + args.mem_length + 1, + device=context_tokens.device, + dtype=torch.float) + last_token = tokens[:, -1:] + next_token_logits, *mems = model(last_token, position_ids, + attention_mask, *mems) + next_token_logits = next_token_logits[:, -1] + + next_token_logits /= args.temperature + frequency_count = torch.zeros(next_token_logits.shape) + for tk in output_tokens_list: + frequency_count[0][tk] += 1 + + next_token_logits -= (args.frequency_penalty + * frequency_count).to(device) + next_token_logits -= ( + args.presence_penalty * # noqa + (frequency_count > 0)).to(device) + + next_token_logits = top_k_logits( + next_token_logits, top_k=args.top_k, top_p=args.top_p) + log_probs = F.softmax(next_token_logits, dim=-1) + prev = torch.multinomial(log_probs, num_samples=1)[0] + is_end = prev.item() in end_tokens + if is_end: + break + decode_tokens = tokenizer.DecodeIds([prev.item()]) # noqa + generated_tokens_list.append(prev.item()) + prev = prev.view(1, 1) + tokens = prev if tokens is None else torch.cat((tokens, prev), dim=1) + counter += 1 + output_tokens_list = tokens.view(-1).contiguous() + return torch.cat((context_tokens, tokens), dim=1), mems + + +def read_context(tokenizer, args, context): + terminate_runs, skip_run = 0, 0 # noqa + if mpu.get_model_parallel_rank() == 0: + while True: + # raw_text = input("\nContext prompt (stop to exit) >>> ") + raw_text = context + if not raw_text: + print('Prompt should not be empty!') + break + # if raw_text == "stop": + # terminate_runs = 1 + # break + generation_mask = '[gMASK]' if args.task_mask else '[MASK]' + if args.block_lm and 'MASK]' not in raw_text: + raw_text += ' ' + generation_mask + # output.write(raw_text) + context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization + if args.block_lm: + context_tokens = [tokenizer.get_command('ENC').Id + ] + context_tokens + if not raw_text.endswith('[gMASK]'): + context_tokens = context_tokens + [ + tokenizer.get_command('eos').Id + ] + context_length = len(context_tokens) + + if context_length >= args.seq_length: + print('\nContext length', context_length, + '\nPlease give smaller context than the window length!') + break + break + else: + context_length = 0 + + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast( + terminate_runs_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + terminate_runs = terminate_runs_tensor[0].item() + + if terminate_runs == 1: + return terminate_runs, None, None, None + + context_length_tensor = torch.cuda.LongTensor([context_length]) + + torch.distributed.broadcast( + context_length_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + context_length = context_length_tensor[0].item() 
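+    # Rank 0 holds the real tokenization; the other model-parallel ranks allocate
+    # a placeholder of the broadcast length and receive the token ids below.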
+ if mpu.get_model_parallel_rank() == 0: + context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + else: + context_tokens_tensor = torch.cuda.LongTensor([0] * context_length) + torch.distributed.broadcast( + context_tokens_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + if mpu.get_model_parallel_rank() != 0: + raw_text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) + return terminate_runs, raw_text, context_tokens_tensor, context_length + + +@MODELS.register_module(Tasks.text_summarization, module_name=Models.mglm) +class MGLMForTextSummarization(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text summarization model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + from .configure_data import prepare_tokenizer + # Disable CuDNN. + torch.backends.cudnn.enabled = False + # Arguments. + self.args = setup_args(get_args()) + self.args.load_pretrained = model_dir + # Pytorch distributed. + try: + initialize_distributed(self.args) + except (RuntimeError): + print('group process initialized twice') + # Random seeds for reproducability. + set_random_seed(self.args.seed) + # setting default batch size to 1 + self.args.batch_size = 1 + self.args.tokenizer_path = model_dir + self.tokenizer = prepare_tokenizer(self.args) + self.model = setup_model(self.args) + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + + def forward(self, input: Dict[str, str]) -> Dict[str, str]: + pass + + def generate(self, input: Dict[str, str]) -> Dict[str, str]: + model = self.model + tokenizer = self.tokenizer + args = self.args + device = torch.cuda.current_device() + model.eval() + + context = input['text'] + self.cfg.model.prompt + with torch.no_grad(): + terminate_runs, raw_text, context_tokens_tensor, context_length = read_context( + tokenizer, args, context) + mems = [] + tokens, attention_mask, position_ids = get_batch( + context_tokens_tensor, device, args) + mask_tokens = ['MASK', 'sMASK', 'gMASK' + ] if args.task_mask else ['MASK'] + mask_tokens = [ + tokenizer.get_command(token).Id for token in mask_tokens + ] + end_tokens = [tokenizer.get_command('eop').Id, args.eod_token] + + mask_positions = [] + for token in mask_tokens: + mask_positions += (context_tokens_tensor == token).nonzero( + as_tuple=True)[0].tolist() + mask_positions.sort() + if args.no_block_position: + for mask_position in mask_positions: + position_ids[0, mask_position + 1:] += args.out_seq_length + _, *mems = model(tokens, position_ids, attention_mask, *mems) + for mask_position in mask_positions: + if args.no_block_position: + position = position_ids[0, mask_position].item() + else: + position = mask_position + tokens, mems, = sample_sequence( + model, + tokenizer, + tokens, + position, + args, + device, + mems=mems, + end_tokens=end_tokens) + output_tokens_list = tokens.view(-1).contiguous() + trim_decode_tokens = tokenizer.DecodeIds( + output_tokens_list.tolist()) + res = trim_decode_tokens.split('<|startofpiece|>')[-1] + print(res) + return {OutputKeys.TEXT: res} diff --git a/modelscope/models/nlp/mglm/model/__init__.py b/modelscope/models/nlp/mglm/model/__init__.py new file mode 100755 index 00000000..84c55ae3 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .distributed import (DistributedDataParallel, + PyTorchDistributedDataParallel) +from .downstream import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, + GLMForSequenceClassification, GLMForSingleTokenCloze) +from .modeling_glm import (GLMModel, + glm_get_params_for_weight_decay_optimization) diff --git a/modelscope/models/nlp/mglm/model/distributed.py b/modelscope/models/nlp/mglm/model/distributed.py new file mode 100755 index 00000000..a3c84e9f --- /dev/null +++ b/modelscope/models/nlp/mglm/model/distributed.py @@ -0,0 +1,127 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.modules import Module +from torch.nn.parallel.distributed import DistributedDataParallel as DDP + +from modelscope.models.nlp.mglm import mpu + + +class PyTorchDistributedDataParallel(DDP): + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + self.data_parallel_group = mpu.get_data_parallel_group() + src_rank = mpu.get_model_parallel_rank() + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, src_rank, group=self.data_parallel_group) + + def allreduce_params(reduce_after=True, + no_scale=False, + fp32_allreduce=False): + if (self.needs_reduction): + self.needs_reduction = False + buckets = {} + for name, param in self.module.named_parameters(): + if param.requires_grad and param.grad is not None: + tp = (param.data.type()) + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print( + 'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.' 
# noqa + ) + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + dist.all_reduce(coalesced, group=self.data_parallel_group) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + for buf, synced in zip( + grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. 
+ if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' diff --git a/modelscope/models/nlp/mglm/model/downstream.py b/modelscope/models/nlp/mglm/model/downstream.py new file mode 100644 index 00000000..61b1e807 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/downstream.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 Zhipu.AI +"""Multiple choice model.""" + +import torch +import torch.nn + +from .modeling_glm import GLMModel + + +class GLMForMultiTokenCloze(torch.nn.Module): + + def __init__(self, + language_model: GLMModel, + take_softmax=True, + length_penalty=0.0): + super(GLMForMultiTokenCloze, self).__init__() + self.model = language_model + self.take_softmax = take_softmax + self.length_penalty = length_penalty + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # [h.remove() for h in self.hook_handles] + sd = self.model.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.model.named_parameters(prefix=prefix, recurse=recurse) + + def forward(self, + input_ids, + position_ids, + attention_mask, + target_ids=None, + logit_mask=None, + prompt_pos=None): + if target_ids is None: + return self.model(input_ids, position_ids, attention_mask) + num_choices = None + if len(input_ids.shape) == 3: + batch_size, num_choices = input_ids.shape[:2] + input_ids = input_ids.reshape(-1, input_ids.size(-1)) + attention_mask = attention_mask.reshape(-1, + *attention_mask.size()[2:]) + position_ids = position_ids.reshape(-1, *position_ids.size()[2:]) + target_ids = target_ids.reshape(-1, target_ids.size(-1)) + logit_mask = logit_mask.reshape(-1, logit_mask.size(-1)) + if prompt_pos is not None: + prompt_pos = prompt_pos.reshape(-1, prompt_pos.size(-1)) + outputs, *mems = self.model( + input_ids, position_ids, attention_mask, prompt_pos=prompt_pos) + if self.take_softmax: + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + # select the target logits + batch_ids = torch.arange( + target_ids.size(0), dtype=torch.long, device=target_ids.device) + batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids) + seq_ids = torch.arange( + target_ids.size(-1), dtype=torch.long, device=target_ids.device) + seq_ids = seq_ids.unsqueeze(0).expand_as(target_ids) + logits = outputs[batch_ids, seq_ids, target_ids] + logits = (logits * logit_mask).sum(dim=1) + if self.length_penalty > 0.0: + logits = logits / logit_mask.sum(dim=1)**self.length_penalty + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) + + +class GLMForMultiTokenClozeFast(torch.nn.Module): + + def __init__(self, language_model, take_softmax=True, length_penalty=0.0): + super(GLMForMultiTokenClozeFast, self).__init__() + self.model = language_model + self.take_softmax = take_softmax + self.length_penalty = length_penalty + + def forward(self, input_ids, position_ids, attention_mask, dec_input_ids, + dec_position_ids, dec_attention_mask, dec_target_ids, + dec_logit_mask): + # encoder + outputs, *mems = self.model( + input_ids, + position_ids, + attention_mask, + return_memory=True, + detach_memory=False) + batch_size, num_choices, max_dec_len = dec_input_ids.size() + max_enc_len = input_ids.size(-1) + + enc_mems = [] + for hidden in mems: + hidden = hidden.unsqueeze(1).expand(-1, 
num_choices, -1, + -1).reshape( + batch_size * num_choices, + *hidden.size()[1:]) + enc_mems.append(hidden) + + def build_dec_mask_matrix(seq_length, sep, memory_length=0): + m = enc_mems[0].new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + + # sep = dec_attention_mask + ids = torch.arange( + memory_length, device=sep.device, dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) # batch * mem + mask = mask.unsqueeze(1).float().expand(-1, seq_length, -1) + + m = m.expand(batch_size * num_choices, -1, -1) + m = torch.cat((mask, m), dim=2) + m = m.unsqueeze(1) + return m + + dec_input_ids = dec_input_ids.reshape(-1, max_dec_len) + dec_position_ids = dec_position_ids.reshape( + -1, + *dec_position_ids.size()[2:]) + # dec_attention_mask = dec_attention_mask.reshape(-1, *dec_attention_mask.size()[2:]).unsqueeze(1) + dec_attention_mask = build_dec_mask_matrix( + max_dec_len, dec_attention_mask.reshape(-1), max_enc_len) + dec_target_ids = dec_target_ids.reshape(-1, dec_target_ids.size(-1)) + dec_logit_mask = dec_logit_mask.reshape(-1, dec_logit_mask.size(-1)) + + outputs, *mems = self.model(dec_input_ids, dec_position_ids, + dec_attention_mask, *enc_mems) + if self.take_softmax: + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + + batch_ids = torch.arange( + dec_target_ids.size(0), + dtype=torch.long, + device=dec_target_ids.device) + batch_ids = batch_ids.unsqueeze(1).expand_as(dec_target_ids) + seq_ids = torch.arange( + dec_target_ids.size(-1), + dtype=torch.long, + device=dec_target_ids.device) + seq_ids = seq_ids.unsqueeze(0).expand_as(dec_target_ids) + logits = outputs[batch_ids, seq_ids, dec_target_ids] + logits = (logits * dec_logit_mask).sum(dim=1) + if self.length_penalty > 0.0: + logits = logits / dec_logit_mask.sum(dim=1)**self.length_penalty + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) + + +class GLMForSingleTokenCloze(torch.nn.Module): + + def __init__(self, language_model, take_softmax=False): + super().__init__() + self.model = language_model + self.take_softmax = take_softmax + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # [h.remove() for h in self.hook_handles] + sd = self.model.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.model.named_parameters(prefix=prefix, recurse=recurse) + + def forward(self, + input_ids, + position_ids, + attention_mask, + target_ids=None, + logit_mask=None, + prompt_pos=None): + if target_ids is None: + return self.model(input_ids, position_ids, attention_mask) + assert len(input_ids.shape) == 2 + outputs, *mems = self.model( + input_ids, position_ids, attention_mask, prompt_pos=prompt_pos) + batch_ids = torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device) + target_logits = outputs[batch_ids, attention_mask] + if self.take_softmax: + target_prob = torch.nn.functional.log_softmax( + target_logits, dim=-1) + else: + target_prob = target_logits + batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids) + output = target_prob[batch_ids, target_ids] + + return (output, target_logits, *mems) + + +class GLMForSequenceClassification(torch.nn.Module): + + def __init__(self, + language_model, + hidden_size, + hidden_dropout, + pool_token, + num_class=1): + super().__init__() + self.pool_token = pool_token + 
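+        # pool_token selects the hidden state fed to the classification head in
+        # forward(): 'start' and 'pad' index via attention_mask, 'cls' uses token 0.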
self.model = language_model + self.num_class = num_class + # Multi-choice head. + self.pool_layer = torch.nn.Linear(hidden_size, hidden_size) + self.multichoice_dropout = torch.nn.Dropout(hidden_dropout) + self.multichoice_head = torch.nn.Linear(hidden_size, num_class) + + def forward(self, input_ids, position_ids, attention_mask): + num_choices = None + if len(input_ids.shape) == 3: + assert self.num_class == 1 + batch_size, num_choices = input_ids.shape[:2] + input_ids = input_ids.reshape(-1, input_ids.size(-1)) + attention_mask = attention_mask.reshape(-1, + *attention_mask.size()[2:]) + position_ids = position_ids.reshape(-1, *position_ids.size()[2:]) + outputs, *mems = self.model(input_ids, position_ids, attention_mask) + if self.pool_token == 'start': + output = outputs[torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device), attention_mask] + elif self.pool_token == 'pad': + output = outputs[torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device), attention_mask - 1] + elif self.pool_token == 'cls': + output = outputs[:, 0] + else: + raise NotImplementedError + output = torch.tanh(self.pool_layer(output)) + multichoice_output = self.multichoice_dropout(output) + logits = self.multichoice_head(multichoice_output) + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py new file mode 100644 index 00000000..965f82a7 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/modeling_bert.py @@ -0,0 +1,1576 @@ +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import copy +import logging +import math +import os +import shutil +import tarfile +import tempfile + +import json +import mpu +import torch +import torch.nn.functional as F +from data_utils.file_utils import cached_path +from torch import nn +from torch.nn import CrossEntropyLoss + +# from torch.utils.checkpoint import checkpoint + + +def normal_init_method(mean, std): + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def scaled_init_method(mean, std, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = std / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def bert_extended_attention_mask(attention_mask): + # We create a 3D attention mask from a 2D tensor mask. 
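+    # e.g. a padding mask [[1, 1, 0]] yields the pairwise product
+    # [[1, 1, 0], [1, 1, 0], [0, 0, 0]], unsqueezed to shape [1, 1, 3, 3].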
+ # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + + return extended_attention_mask + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': + '/root/data/bert-base-uncased.tar.gz', + 'bert-large-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz', + 'bert-base-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz', + 'bert-large-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz', + 'bert-base-multilingual-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz', + 'bert-base-multilingual-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz', + 'bert-base-chinese': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz', +} +CONFIG_NAME = 'bert_config.json' +WEIGHTS_NAME = 'pytorch_model.bin' +TF_WEIGHTS_NAME = 'model.ckpt' + + +def load_tf_weights_in_bert(model, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print( + 'Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print('Converting TensorFlow checkpoint from {}'.format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print('Loading TF weight {} with shape {}'.format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ['adam_v', 'adam_m'] for n in name): + print('Skipping {}'.format('/'.join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) # noqa + else: + l = [m_name] # noqa + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print('Initialize PyTorch weight {}'.format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. 
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + deep_init=False, + fp32_layernorm=False, + fp32_embedding=False, + fp32_tokentypes=False, + layernorm_epsilon=1e-12): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open( + vocab_size_or_config_json_file, 'r', + encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.deep_init = deep_init + self.fp32_layernorm = fp32_layernorm + self.fp32_embedding = fp32_embedding + self.layernorm_epsilon = layernorm_epsilon + self.fp32_tokentypes = fp32_tokentypes + else: + raise ValueError( + 'First argument must be either a vocabulary size (int)' + 'or the path to a pretrained model config file (str)') + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, 'r', encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + print( + 'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.' + ) + + class BertLayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + # self.word_embeddings = mpu.VocabParallelEmbedding( + # config.vocab_size, config.hidden_size, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float( + ) + token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = 
self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + previous_type = attention_probs.type() # noqa + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( # noqa + mean=0.0, std=config.initializer_range) + self.dense = nn.Linear( + config.hidden_size, config.hidden_size, bias=True) + # self.dense = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=config.hidden_size, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=init_method) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + # self.self = mpu.BertParallelSelfAttention( + # hidden_size=config.hidden_size, + # num_attention_heads=config.num_attention_heads, + # dropout_prob=config.attention_probs_dropout_prob, + # output_parallel=True, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear( + config.hidden_size, config.intermediate_size, bias=True) + # self.dense = mpu.ColumnParallelLinear( + # 
input_size=config.hidden_size, + # output_size=config.intermediate_size, + # bias=True, + # gather_output=False, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( # noqa + mean=0.0, std=config.initializer_range) + self.dense = nn.Linear( + config.intermediate_size, config.hidden_size, bias=True) + # self.dense = mpu.RowParallelLinear( + # input_size=config.intermediate_size, + # output_size=config.hidden_size, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=init_method) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super(BertEncoder, self).__init__() + # layer = BertLayer(config) + # self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + # def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + # all_encoder_layers = [] + # for layer_module in self.layer: + # hidden_states = layer_module(hidden_states, attention_mask) + # if output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # if not output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # return all_encoder_layers + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False): + all_encoder_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + + return custom_forward + + if checkpoint_activations: + l = 0 # noqa + num_layers = len(self.layer) + chunk_length = 1 # math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = mpu.checkpoint( + custom(l, l + chunk_length), 
hidden_states, + attention_mask * 1) + l += chunk_length # noqa + # decoder layers + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
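+ # In the reference (non-model-parallel) implementation the weight of the
+ # decoder defined below is shared with the word-embedding matrix, which is
+ # why its shape is taken from bert_model_embedding_weights
+ # ([vocab_size, hidden_size]); the model-parallel variants of the tied
+ # weight and bias are kept below as comments.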
+ self.decoder = nn.Linear( + bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + # self.decoder_weight = bert_model_embedding_weights + # self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + # self.bias.model_parallel = True + self.fp32_embedding = config.fp32_embedding + self.fp32_layernorm = config.fp32_layernorm + + def convert_to_type(tensor): + if self.fp32_embedding: + return tensor.half() + else: + return tensor + + self.type_converter = convert_to_type + self.converted = False + + def forward(self, hidden_states): + if not self.converted: + self.converted = True + if self.fp32_embedding: + self.transform.half() + if self.fp32_layernorm: + self.transform.LayerNorm.float() + hidden_states = self.transform(self.type_converter(hidden_states)) + hidden_states = self.decoder(hidden_states) + self.bias + # hidden_states = mpu.copy_to_model_parallel_region(hidden_states) + # hidden_states = F.linear(self.type_converter(hidden_states), + # self.type_converter(self.decoder_weight), + # self.type_converter(self.bias)) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + for p in self.seq_relationship.parameters(): + if p is None: + continue + pooled_output = pooled_output.type_as(p) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + 'Parameter config in `{}(config)` should be an instance of class `BertConfig`. ' + 'To create a model from a Google pretrained model use ' + '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format( + self.__class__.__name__, self.__class__.__name__)) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. 
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, + pretrained_model_name, + state_dict=None, + cache_dir=None, + fp32_layernorm=False, + fp32_embedding=False, + layernorm_epsilon=1e-12, + fp32_tokentypes=False, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ # noqa + if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] + else: + archive_file = pretrained_model_name + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path( + archive_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + 'associated to this path or url.'.format( + pretrained_model_name, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + if resolved_archive_file == archive_file: + logger.info('loading archive file {}'.format(archive_file)) + else: + logger.info('loading archive file {} from cache at {}'.format( + archive_file, resolved_archive_file)) + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info('extracting archive file {} to temp dir {}'.format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = BertConfig.from_json_file(config_file) + config.fp32_layernorm = fp32_layernorm + config.fp32_embedding = fp32_embedding + config.layernorm_epsilon = layernorm_epsilon + config.fp32_tokentypes = fp32_tokentypes + logger.info('Model config {}'.format(config)) + # Instantiate model. 
+ model = cls(config, *inputs, **kwargs) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, + True, missing_keys, unexpected_keys, + error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + print('Weights of {} not initialized from pretrained model: {}'. + format(model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + print('Weights from pretrained model not used in {}: {}'.format( + model.__class__.__name__, unexpected_keys)) + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class BertModel(PreTrainedBertModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLF`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ # noqa + + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + output_all_encoded_layers=True, + checkpoint_activations=False): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder( + embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class BertForPreTraining(PreTrainedBertModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. 
+ + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads( + config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + masked_lm_labels=None, + next_sentence_label=None, + checkpoint_activations=False): + sequence_output, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size).float(), + masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2).float(), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + else: + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(PreTrainedBertModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config, + self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + masked_lm_labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + return masked_lm_loss + else: + return prediction_scores + + +class BertForNextSentencePrediction(PreTrainedBertModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + next_sentence_label=None, + checkpoint_activations=False): + _, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(PreTrainedBertModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + _, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForMultipleChoice(PreTrainedBertModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMultipleChoice, self).__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + batch_size, num_choices = input_ids.shape[:2] + flat_input_ids = input_ids.reshape(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.reshape(-1, + token_type_ids.size(-1)) + flat_attention_mask = attention_mask.reshape(-1, + attention_mask.size(-1)) + _, pooled_output = self.bert( + flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape(-1, num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + else: + return reshaped_logits + + +class BertForTokenClassification(PreTrainedBertModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + # self.classifier = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=num_labels, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + with mpu.get_cuda_rng_tracker().fork(): + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForQuestionAnswering(PreTrainedBertModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. 
+ + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + # self.qa_outputs = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=2, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + start_positions=None, + end_positions=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits diff --git a/modelscope/models/nlp/mglm/model/modeling_glm.py b/modelscope/models/nlp/mglm/model/modeling_glm.py new file mode 100644 index 00000000..80f61cef --- /dev/null +++ b/modelscope/models/nlp/mglm/model/modeling_glm.py @@ -0,0 +1,245 @@ +# Modified by Zhipu.AI +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPT-2 model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.nlp.mglm import mpu +from modelscope.models.nlp.mglm.model.prompt import PromptSpell +from modelscope.models.nlp.mglm.utils import print_rank_0 + + +def init_method_normal(std=0.02): + """Init method based on normal distribution. + + This is only used for embeddings. The transformer has its + own initializer. + """ + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GLMModel(torch.nn.Module): + """GLM Language model. + + The output of the forward method are the logits (parallel or + serial depending on the `parallel_output` flag. + """ + + def __init__( + self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + max_sequence_length, + max_memory_length, + checkpoint_activations, + checkpoint_num_layers=1, + parallel_output=True, + relative_encoding=False, + block_position_encoding=False, + output_predict=True, + spell_length=None, + spell_func='lstm', + attention_scale=1.0, + ): + + super(GLMModel, self).__init__() + + self.parallel_output = parallel_output + self.output_predict = output_predict + self.hidden_size = hidden_size + + init_method = init_method_normal(std=0.02) + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init_method) + + # Transformer + self.transformer = mpu.GPT2ParallelTransformer( + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers, + attention_scale=attention_scale, + relative_encoding=relative_encoding, + block_position_encoding=block_position_encoding) + if spell_length is not None: + self.prompt_spell = PromptSpell(spell_length, self.hidden_size, + spell_func) + + def freeze_transformer(self, tune_prefix_layers=None): + log_str = 'Freeze transformer' + self.word_embeddings.requires_grad_(False) + self.transformer.requires_grad_(False) + if tune_prefix_layers is not None: + log_str += f' tune {tune_prefix_layers} prefix layers' + for i in range(tune_prefix_layers): + self.transformer.layers[i].requires_grad_(True) + print_rank_0(log_str) + + def forward(self, + input_ids, + position_ids, + attention_mask, + *mems, + return_memory=False, + detach_memory=True, + prompt_pos=None): + # Embeddings. + batch_size = input_ids.size(0) + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + if prompt_pos is not None: + embeddings = embeddings.clone() + prompt_embeds = self.prompt_spell() + batch_index = torch.arange( + batch_size, device=input_ids.device).unsqueeze(1) + embeddings[batch_index, prompt_pos] = prompt_embeds + # Transformer. 
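+ # mems, when given, carries hidden states cached from earlier segments
+ # (bounded by max_memory_length); the first element of the transformer
+ # output is the final hidden states, which are projected onto the
+ # vocabulary below when output_predict is set.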
+ transformer_output = self.transformer( + embeddings, + position_ids, + attention_mask, + mems, + return_memory=return_memory, + detach_memory=detach_memory) + logits, hidden_layers = transformer_output + outputs = hidden_layers + + if self.output_predict: + # Parallel logits. + logits_parallel = mpu.copy_to_model_parallel_region(logits) + logits_parallel = F.linear(logits_parallel, + self.word_embeddings.weight) + + if self.parallel_output: + return (logits_parallel, *outputs) + + return (mpu.gather_from_model_parallel_region(logits_parallel), + *outputs) + else: + return (logits, *outputs) + + +class EncoderDecoder(torch.nn.Module): + """Seq2Seq Transformer Model + The output of the forward method are the logits (parallel or serial depending on the `parallel_output` flag). + """ + + def __init__(self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + max_sequence_length, + max_memory_length, + checkpoint_activations, + checkpoint_num_layers=1, + parallel_output=True, + output_predict=True): + super(EncoderDecoder, self).__init__() + + self.parallel_output = parallel_output + self.output_predict = output_predict + + init_method = init_method_normal(std=0.02) + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init_method) + + # Transformer + self.encoder = mpu.GPT2ParallelTransformer( + num_layers, hidden_size, num_attention_heads, max_sequence_length, + max_memory_length, embedding_dropout_prob, attention_dropout_prob, + output_dropout_prob, checkpoint_activations, checkpoint_num_layers) + self.decoder = mpu.GPT2ParallelTransformer( + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers, + use_decoder_layer=True) + + def forward(self, source_ids, target_ids, source_position_ids, + target_position_ids, source_mask, target_mask): + # Embeddings. + source_embeddings = self.word_embeddings(source_ids) + target_embeddings = self.word_embeddings(target_ids) + + # Transformer. + encoder_output, _ = self.encoder(source_embeddings, + source_position_ids, source_mask) + decoder_output, _ = self.decoder(target_embeddings, + target_position_ids, target_mask) + if self.output_predict: + # Parallel logits. 
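+ # Decoder hidden states are projected onto the shared word-embedding
+ # matrix; each model-parallel rank produces its own vocabulary shard,
+ # which is returned as-is (parallel_output=True) or gathered across
+ # ranks.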
+ output_parallel = mpu.copy_to_model_parallel_region(decoder_output) + logits_parallel = F.linear(output_parallel, + self.word_embeddings.weight) + + if self.parallel_output: + return (logits_parallel, ) + + return (mpu.gather_from_model_parallel_region(logits_parallel), ) + else: + return (decoder_output, ) + + +def glm_get_params_for_weight_decay_optimization(module): + weight_decay_params = {'params': []} + no_weight_decay_params = {'params': [], 'weight_decay': 0.0} + for module_ in module.modules(): + if isinstance(module_, (mpu.LayerNorm, torch.nn.LayerNorm)): + no_weight_decay_params['params'].extend([ + p for p in list(module_._parameters.values()) + if p is not None and p.requires_grad + ]) + else: + weight_decay_params['params'].extend([ + p for n, p in list(module_._parameters.items()) + if p is not None and p.requires_grad and n != 'bias' + ]) + no_weight_decay_params['params'].extend([ + p for n, p in list(module_._parameters.items()) + if p is not None and p.requires_grad and n == 'bias' + ]) + + return weight_decay_params, no_weight_decay_params diff --git a/modelscope/models/nlp/mglm/model/prompt.py b/modelscope/models/nlp/mglm/model/prompt.py new file mode 100644 index 00000000..a29ceda0 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/prompt.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 Zhipu.AI + +import random + +import torch + + +class PromptSpell(torch.nn.Module): + + def __init__(self, spell_length, hidden_size, spell_func): + super(PromptSpell, self).__init__() + self.spell_length = spell_length + self.hidden_size = hidden_size + self.spell_embeddings = torch.nn.Embedding(self.spell_length, + self.hidden_size) + self.spell_func = spell_func + if self.spell_func == 'lstm': + self.lstm_head = torch.nn.LSTM( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + num_layers=2, + # dropout=self.lstm_dropout, + bidirectional=True, + batch_first=True) # .to(torch.device("cuda")) + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(2 * self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size)) + elif self.spell_func == 'mlp': + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size)) + elif self.spell_func != 'none': + raise NotImplementedError('Prompt function ' + self.spell_func) + + def init_embedding(self, word_embeddings=None, task_tokens=None): + num_words = 5000 + with torch.no_grad(): + for i in range(self.spell_length): + rand_token = random.randrange(num_words) + if task_tokens is None: + target_embedding = word_embeddings[rand_token] + else: + word_embedding = word_embeddings[rand_token] + task_token = random.choice(task_tokens) + task_embedding = word_embeddings[task_token] + ratio = random.random() + target_embedding = word_embedding * ratio + task_embedding * ( + 1 - ratio) + self.spell_embeddings.weight.data[i] = target_embedding + + def forward(self): + prompt_embeds = self.spell_embeddings.weight.unsqueeze(0) + if self.spell_func == 'lstm': + prompt_embeds = self.lstm_head(prompt_embeds)[0] + if self.spell_func == 'lstm' or self.spell_func == 'mlp': + prompt_embeds = self.mlp_head(prompt_embeds) + return prompt_embeds diff --git a/modelscope/models/nlp/mglm/mpu/__init__.py b/modelscope/models/nlp/mglm/mpu/__init__.py new file mode 100755 index 00000000..8cca4e2c --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model parallel utility interface.""" + +from .cross_entropy import vocab_parallel_cross_entropy +from .data import broadcast_data +from .grads import clip_grad_norm +from .initialize import (destroy_model_parallel, get_data_parallel_group, + get_data_parallel_rank, get_data_parallel_world_size, + get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_src_rank, + get_model_parallel_world_size, + initialize_model_parallel, + model_parallel_is_initialized) +from .layers import (ColumnParallelLinear, ParallelEmbedding, + RowParallelLinear, VocabParallelEmbedding) +from .mappings import (copy_to_model_parallel_region, + gather_from_model_parallel_region, + reduce_from_model_parallel_region, + scatter_to_model_parallel_region) +from .random import (checkpoint, get_cuda_rng_tracker, + model_parallel_cuda_manual_seed, + partition_activations_in_checkpoint) +from .transformer import (BertParallelSelfAttention, + BertParallelTransformerLayer, + GPT2ParallelTransformer, LayerNorm) diff --git a/modelscope/models/nlp/mglm/mpu/cross_entropy.py b/modelscope/models/nlp/mglm/mpu/cross_entropy.py new file mode 100644 index 00000000..2ebcf7a8 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/cross_entropy.py @@ -0,0 +1,110 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import (get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_world_size) +from .utils import VocabUtility + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + # Copy so the input remains unchanged. + logits = vocab_parallel_logits.clone() + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(logits, dim=-1)[0] + torch.distributed.all_reduce( + logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group()) + # Subtract the maximum value. + logits.sub_(logits_max.unsqueeze(dim=-1)) + # Sum of exponential of logits along vocab dimension across all GPUs. 
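+ # (Each rank holds only its own vocabulary shard, e.g. with 2 model-parallel
+ # ranks and a 30000-token vocab, rank 0 scores tokens [0, 15000) and rank 1
+ # scores [15000, 30000); the exp-sum below is therefore all-reduced to
+ # recover the full-vocabulary softmax normalizer.)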
+ exp_logits = logits.exp() + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + + # Get the partition's vocab indecies + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_model_parallel_rank() + world_size = get_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range( + partition_vocab_size, rank, world_size) + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | ( + target >= vocab_end_index) + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange( + start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Store softmax, target-mask and masked-target for backward pass. + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as thier gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange( + start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, + masked_target_1d] -= (1.0 - target_mask.view(-1).float()) + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None + + +def vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """Helper function for the cross entropy.""" + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/modelscope/models/nlp/mglm/mpu/data.py b/modelscope/models/nlp/mglm/mpu/data.py new file mode 100644 index 00000000..6f595f0f --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/data.py @@ -0,0 +1,117 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import (get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_src_rank) + +_MAX_DATA_DIM = 5 + + +def _check_data_types(keys, data, target_dtype): + """Check that all the keys have the same target data type.""" + for key in keys: + assert data[key].dtype == target_dtype, '{} has data type {} which '\ + 'is different than {}'.format(key, data[key].dtype, target_dtype) + + +def _build_key_size_numel_dictionaries(keys, data): + """Build the size on rank 0 and broadcast.""" + max_dim = _MAX_DATA_DIM + sizes = [0 for _ in range(max_dim) for _ in keys] + + # Pack the sizes on rank zero. + if get_model_parallel_rank() == 0: + offset = 0 + for key in keys: + assert data[key].dim( + ) < max_dim, 'you should increase MAX_DATA_DIM' + size = data[key].size() + for i, s in enumerate(size): + sizes[i + offset] = s + offset += max_dim + + # Move to GPU and broadcast. + sizes_cuda = torch.cuda.LongTensor(sizes) + torch.distributed.broadcast( + sizes_cuda, + get_model_parallel_src_rank(), + group=get_model_parallel_group()) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Arguments: + keys: list of keys in the data disctionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries( + keys, data) + + # Pack on rank zero. + if get_model_parallel_rank() == 0: + # Check that all keys have the same data type. + _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat( + [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + else: + flatten_data = torch.empty( + total_numel, device=torch.cuda.current_device(), dtype=datatype) + + # Boradcast + torch.distributed.broadcast( + flatten_data, + get_model_parallel_src_rank(), + group=get_model_parallel_group()) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/modelscope/models/nlp/mglm/mpu/grads.py b/modelscope/models/nlp/mglm/mpu/grads.py new file mode 100644 index 00000000..a7dc6c5c --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/grads.py @@ -0,0 +1,72 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
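Editorial aside on broadcast_data above: it avoids one broadcast per key by packing every tensor into a single flat buffer (plus a fixed-size integer buffer describing the shapes) and letting each rank carve the originals back out with narrow() and view(). A single-process sketch of that pack/unpack round trip, with an illustrative payload:

import torch

data = {                                    # illustrative payload, one dtype for all keys
    'tokens': torch.arange(12).view(3, 4),
    'mask': torch.ones(3, 4, dtype=torch.long),
    'labels': torch.tensor([1, 0, 1]),
}
keys = list(data)
key_size = {k: list(data[k].size()) for k in keys}
key_numel = {k: data[k].numel() for k in keys}

# What rank 0 would broadcast: one contiguous buffer for all keys.
flat = torch.cat([data[k].contiguous().view(-1) for k in keys])

# What every rank does after the broadcast: slice and reshape.
out, offset = {}, 0
for k in keys:
    out[k] = flat.narrow(0, offset, key_numel[k]).view(key_size[k])
    offset += key_numel[k]

assert all(torch.equal(out[k], data[k]) for k in keys)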
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import torch +from torch._six import inf + +from .initialize import get_model_parallel_group, get_model_parallel_rank + + +def clip_grad_norm(parameters, max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + max_norm = float(max_norm) + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all GPUs. + torch.distributed.all_reduce( + total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group()) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0 + for p in parameters: + if p.model_parallel or (get_model_parallel_rank() == 0): + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item()**norm_type + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + torch.distributed.all_reduce( + total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + clip_coef = max_norm / (total_norm + 1e-6) + if clip_coef < 1: + for p in parameters: + p.grad.data.mul_(clip_coef) + return total_norm diff --git a/modelscope/models/nlp/mglm/mpu/initialize.py b/modelscope/models/nlp/mglm/mpu/initialize.py new file mode 100644 index 00000000..33f8dbda --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/initialize.py @@ -0,0 +1,130 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model and data parallel groups.""" + +import torch + +from .utils import ensure_divisibility + +# Model parallel group that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Data parallel group that the current rank belongs to. 
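Editorial aside on clip_grad_norm above: the non-inf branch relies on p-norms being combinable from per-rank pieces. Every rank sums ||g||^p over the gradients it owns (parameters that are not model parallel are counted only on rank 0 so they are not added once per rank), the sums are all-reduced, and the p-th root gives the global norm. A small sketch of that combination rule, with list slices standing in for the ranks:

import torch

norm_type = 2.0
max_norm = 1.0
grads_by_rank = [                     # illustrative per-rank gradient shards
    [torch.randn(8), torch.randn(4)],
    [torch.randn(8), torch.randn(4)],
]

# Local sums of ||g||^p, then the "all-reduce SUM" and the p-th root.
partial = [sum(g.norm(norm_type).item() ** norm_type for g in shard)
           for shard in grads_by_rank]
total_norm = sum(partial) ** (1.0 / norm_type)
reference = torch.cat([g for shard in grads_by_rank for g in shard]).norm(norm_type).item()
assert abs(total_norm - reference) < 1e-4

# Same clipping rule as above, applied by every rank to its own shard.
clip_coef = max_norm / (total_norm + 1e-6)
if clip_coef < 1:
    for shard in grads_by_rank:
        for g in shard:
            g.mul_(clip_coef)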
+_DATA_PARALLEL_GROUP = None + + +def initialize_model_parallel(model_parallel_size_): + """ + Initialize model data parallel groups. + + Arguments: + model_parallel_size: number of GPUs used to parallelize model. + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model. The present function will + create 4 model parallel groups and 2 data parallel grous as: + 4 model parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 data parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + if torch.distributed.get_rank() == 0: + print('> initializing model parallel with size {}'.format( + model_parallel_size_)) + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + model_parallel_size = min(model_parallel_size_, world_size) + ensure_divisibility(world_size, model_parallel_size) + rank = torch.distributed.get_rank() + + # Build the data parallel groups. + global _DATA_PARALLEL_GROUP + assert _DATA_PARALLEL_GROUP is None, \ + 'data parallel group is already initialized' + for i in range(model_parallel_size): + ranks = range(i, world_size, model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank % model_parallel_size): + _DATA_PARALLEL_GROUP = group + + # Build the model parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, \ + 'model parallel group is already initialized' + for i in range(world_size // model_parallel_size): + ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank // model_parallel_size): + _MODEL_PARALLEL_GROUP = group + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + return torch.distributed.get_world_size(group=get_model_parallel_group()) + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + return torch.distributed.get_rank(group=get_model_parallel_group()) + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zeor + in the model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return torch.distributed.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return 
torch.distributed.get_rank(group=get_data_parallel_group()) + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None diff --git a/modelscope/models/nlp/mglm/mpu/layers.py b/modelscope/models/nlp/mglm/mpu/layers.py new file mode 100644 index 00000000..4eb94b50 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/layers.py @@ -0,0 +1,357 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import math + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm +from torch.nn.parameter import Parameter + +from .initialize import get_model_parallel_rank, get_model_parallel_world_size +from .mappings import (copy_to_model_parallel_region, + gather_from_model_parallel_region, + reduce_from_model_parallel_region, + scatter_to_model_parallel_region) +from .random import get_cuda_rng_tracker +from .utils import VocabUtility, divide, split_tensor_along_last_dim + + +def _initialize_affine_weight(weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False): + """Initialize affine weight for model parallel. + + Build the master weight on all processes and scatter + the relevant chunk.""" + # If we only use 1 process for model parallelism, bypass scatter. + world_size = get_model_parallel_world_size() + if world_size == 1: + init_method(weight) + if return_master_weight: + return weight + return None + + # Initialize master weight + master_weight = torch.empty( + output_size, input_size, dtype=weight.dtype, requires_grad=False) + init_method(master_weight) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split( + master_weight, per_partition_per_stride_size, dim=partition_dim) + rank = get_model_parallel_rank() + my_weight_list = weight_list[rank::world_size] + + with torch.no_grad(): + torch.cat(my_weight_list, dim=partition_dim, out=weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__(self, + num_embeddings, + embedding_dim, + init_method=init.xavier_normal_): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. 
+ self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the vocaburaly dimension. + self.vocab_start_index, self.vocab_end_index = \ + VocabUtility.vocab_range_from_global_vocab_size( + self.num_embeddings, get_model_parallel_rank(), + get_model_parallel_world_size()) + self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index # noqa + + # Allocate weights. + self.weight = Parameter( + torch.Tensor(self.num_embeddings_per_partition, + self.embedding_dim)) + self.weight.model_parallel = True + # And initialize. + _initialize_affine_weight(self.weight, self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, 0, + init_method) + + def forward(self, input_): + # Build the mask. + input_mask = (input_ < self.vocab_start_index) | \ + (input_ >= self.vocab_end_index) + # Mask the input. + masked_input = input_.clone() - self.vocab_start_index + masked_input[input_mask] = 0 + # Get the embeddings. + output_parallel = F.embedding(masked_input, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + # Mask the output embedding. + output_parallel[input_mask, :] = 0.0 + # Reduce across all the model parallel GPUs. + output = reduce_from_model_parallel_region(output_parallel) + return output + + +class ParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the embedding dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__(self, + num_embeddings, + embedding_dim, + init_method=init.xavier_normal_, + keep_master_weight_for_test=False): + super(ParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set some detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the embedding dimension. + world_size = get_model_parallel_world_size() + self.embedding_dim_per_partition = divide(self.embedding_dim, + world_size) + + # Allocate weights. + self.weight = Parameter( + torch.Tensor(self.num_embeddings, + self.embedding_dim_per_partition)) + self.weight.model_parallel = True + # And initialize. + _initialize_affine_weight( + self.weight, + self.num_embeddings, + self.embedding_dim, + self.embedding_dim_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False) + + def forward(self, input_): + input_parallel = copy_to_model_parallel_region(input_) + output_parallel = F.embedding(input_parallel, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + output = gather_from_model_parallel_region(output_parallel) + return output + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. 
+ bias: If true, add bias + gather_output: If true, call all-gether on output and make Y avaiable + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + + def __init__(self, + input_size, + output_size, + bias=True, + gather_output=True, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.output_size_per_partition = divide(output_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter( + torch.Tensor(self.output_size_per_partition, self.input_size)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size_per_partition)) + self.bias.model_parallel = True + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test) + + def forward(self, input_): + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight, self.bias) + if self.gather_output: + # All-gather across the partitions. + output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + + def __init__(self, + input_size, + output_size, + bias=True, + input_is_parallel=False, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + + # Parameters. 
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter( + torch.Tensor(self.output_size, self.input_size_per_partition)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test) + + def forward(self, input_): + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight) + # All-reduce across all the partitions. + output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output diff --git a/modelscope/models/nlp/mglm/mpu/mappings.py b/modelscope/models/nlp/mglm/mpu/mappings.py new file mode 100644 index 00000000..b3056dd7 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/mappings.py @@ -0,0 +1,144 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import get_model_parallel_group +from .utils import split_tensor_along_last_dim + + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_, group=group) + + return input_ + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Split along last dimension. + world_size = torch.distributed.get_world_size(group=group) + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = torch.distributed.get_rank(group=group) + output = input_list[rank].contiguous() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. 
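Editorial aside on the two linear layers above: both shard the weight that F.linear actually sees (the transpose of the A in the docstrings). ColumnParallelLinear splits it along dim 0 and concatenates the per-rank outputs; RowParallelLinear splits it along dim 1, feeds each rank its slice of the input, and sums the partial products. A single-process check of both decompositions against a dense F.linear, with illustrative sizes:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
world_size, batch, d_in, d_out = 2, 5, 6, 8
X = torch.randn(batch, d_in)
W = torch.randn(d_out, d_in)             # dense weight as F.linear expects it

# Column parallel: shard W along dim 0, every rank sees the full X,
# the per-rank outputs are concatenated (the gather) along the last dim.
col_shards = torch.split(W, d_out // world_size, dim=0)
Y_col = torch.cat([F.linear(X, w) for w in col_shards], dim=-1)

# Row parallel: shard W along dim 1 and X along its last dim (the scatter),
# the per-rank partial products are summed (the all-reduce).
row_shards = torch.split(W, d_in // world_size, dim=1)
x_shards = torch.split(X, d_in // world_size, dim=-1)
Y_row = sum(F.linear(x, w) for x, w in zip(x_shards, row_shards))

Y_ref = F.linear(X, W)
assert torch.allclose(Y_col, Y_ref, atol=1e-6)
assert torch.allclose(Y_row, Y_ref, atol=1e-6)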
+ last_dim = input_.dim() - 1 + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=last_dim).contiguous() + + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-redcue the input from the model parallel region.""" + + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def forward(ctx, input_): + return _split(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + + @staticmethod + def forward(ctx, input_): + return _gather(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split(grad_output) + + +# ----------------- +# Helper functions. +# ----------------- + + +def copy_to_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) diff --git a/modelscope/models/nlp/mglm/mpu/random.py b/modelscope/models/nlp/mglm/mpu/random.py new file mode 100755 index 00000000..2cdf236d --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/random.py @@ -0,0 +1,408 @@ +# Modified by Samyam Rajbhandari +# Used to partition the activations stored for backward propagation +# Therefore reduces the memory consumption + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
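Editorial aside on mappings.py above: the four autograd functions form conjugate pairs. copy_to_model_parallel_region is an identity forward with an all-reduce backward, reduce_from_model_parallel_region is the opposite, and the scatter/gather pair swaps _split and _gather between forward and backward. A single-process sketch of the split/gather pair along the last dimension (world_size and the chosen rank are illustrative):

import torch

world_size, rank = 4, 2
x = torch.randn(3, 8)

chunks = torch.split(x, x.size(-1) // world_size, dim=-1)   # what _split produces
local = chunks[rank].contiguous()                           # the slice this rank keeps
restored = torch.cat(chunks, dim=-1)                        # what _gather rebuilds

assert local.shape == (3, 2)
assert torch.equal(restored, x)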
+ +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch +import contextlib + +import torch +import torch.distributed as dist +from torch import _C +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager + +from .initialize import (get_data_parallel_rank, get_model_parallel_group, + get_model_parallel_rank, + get_model_parallel_world_size) + +# from torch.utils.checkpoint import detach_variable + +PARTITION_ACTIVATIONS = False +PA_CORRECTNESS_TEST = False + + +def see_memory_usage(message, force=False): + if not force: + return + dist.barrier() + if dist.get_rank() == 0: + print(message) + print('Memory Allocated ', + torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Max Memory Allocated ', + torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Cache Allocated ', + torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') + print('Max cache Allocated ', + torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + 'GigaBytes') + print(' ') + # input("Press Any Key To Continue ..") + + +mp_rank = None # get_model_parallel_rank() +mp_size = None # get_model_parallel_world_size() +mp_group = None # get_model_parallel_group() + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' +transport_stream = None +cuda_device = None + + +def detach_variable(inputs, device=None): + if isinstance(inputs, tuple): + out = [] + for inp in inputs: + if not isinstance(inp, torch.Tensor): + out.append(inp) + continue + + requires_grad = inp.requires_grad + + if device is not None: + x = inp.to(device=device) + else: + x = inp + + x = x.detach() + x.requires_grad = requires_grad + out.append(x) + return tuple(out) + else: + raise RuntimeError( + 'Only tuple of tensors is supported. Got Unsupported input type: ', + type(inputs).__name__) + + +def _set_cuda_rng_state(new_state, device=-1): + """Sets the random number generator state of the current GPU. + + Argumentss: + new_state (torch.ByteTensor): The desired state + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + with a single change: the input state is not cloned. Cloning caused + major performance issues for +4 GPU cases. + """ + if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): + # older PyTorch + def cb(): + with device_ctx_manager(device): + _C._cuda_setRNGState(new_state) + else: + # newer PyTorch + if device == -1: + device = torch.device('cuda') + elif isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device('cuda', device) + + def cb(): + idx = device.index + if idx is None: + idx = torch.cuda.current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state) + + _lazy_call(cb) + + +class CudaRNGStatesTracker: + """Tracker for the cuda RNG states. + + Using the `add` method, a cuda rng state is initialized based on + the input `seed` and is assigned to `name`. Later, by forking the + rng state, we can perform operations and return to our starting + cuda state. + """ + + def __init__(self): + # Map from a string name to the cuda rng state. + self.states_ = {} + # Seeds are just for book keeping and ensure no seed is set twice. + self.seeds_ = set() + + def reset(self): + """Set to the initial state (no tracker).""" + self.states_ = {} + self.seeds_ = set() + + def get_states(self): + """Get rng states. 
Copy the dictionary so we have direct + pointers to the states, not just a pointer to the dictionary.""" + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states(self, states): + """Set the rng states. For efficiency purposes, we do not check + the size of seed for compatibility.""" + self.states_ = states + + def add(self, name, seed): + """Track the rng state.""" + # Check seed is not already used. + if seed in self.seeds_: + raise Exception('seed {} already exists'.format(seed)) + self.seeds_.add(seed) + # Check that state is not already defined. + if name in self.states_: + raise Exception('cuda rng state {} already exists'.format(name)) + # Get the current rng state. + orig_rng_state = torch.cuda.get_rng_state() + # Set the new state and store it. + torch.cuda.manual_seed(seed) + self.states_[name] = torch.cuda.get_rng_state() + # Reset rng state to what it was. + _set_cuda_rng_state(orig_rng_state) + + @contextlib.contextmanager + def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): + """Fork the cuda rng state, perform operations, and exit with + the original state.""" + # Check if we have added the state + if name not in self.states_: + raise Exception('cuda rng state {} is not added'.format(name)) + # Store current rng state. + orig_cuda_rng_state = torch.cuda.get_rng_state() + # Set rng state to the desired one + _set_cuda_rng_state(self.states_[name]) + # Do the stuff we wanted to do. + try: + yield + finally: + # Update the current rng state for later use. + self.states_[name] = torch.cuda.get_rng_state() + # And set the state to the original state we started with. + _set_cuda_rng_state(orig_cuda_rng_state) + + +# RNG tracker object. +_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + + +def get_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _CUDA_RNG_STATE_TRACKER + + +def model_parallel_cuda_manual_seed(seed): + """Initialize model parallel cuda seed. + + This function should be called after the model parallel is + initialized. Also, no torch.cuda.manual_seed should be called + after this function. Basically, this is replacement for that + function. + Two set of RNG states are tracked: + default state: This is for data parallelism and is the same among a + set of model parallel GPUs but different across + different model paralle groups. This is used for + example for dropout in the non-model-parallel regions. + model-parallel state: This state is different among a set of model + parallel GPUs, but the same across data parallel + groups. This is used for example for dropout in + model parallel regions. + """ + # 2718 is just for fun and any POSITIVE value will work. + offset = seed + 2718 + model_parallel_seed = offset + get_model_parallel_rank() + # Data parallel gets the original sedd. + data_parallel_seed = seed + + if torch.distributed.get_rank() == 0: + print( + '> initializing model parallel cuda seeds on global rank {}, ' + 'model parallel rank {}, and data parallel rank {} with ' + 'model parallel seed: {} and data parallel seed: {}'.format( + torch.distributed.get_rank(), get_model_parallel_rank(), + get_data_parallel_rank(), model_parallel_seed, + data_parallel_seed), + flush=True) + _CUDA_RNG_STATE_TRACKER.reset() + # Set the default state. + torch.cuda.manual_seed(data_parallel_seed) + # and model parallel state. 
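Editorial aside on the seed layout being set up here: every rank keeps the user seed for the default (data-parallel) generator, while the tracked model-parallel state is offset by the model-parallel rank, so ranks in the same data-parallel group reseed identically. A tiny sketch for the docstring's situation of 8 GPUs and model_parallel_size=2 (sizes illustrative):

seed, world_size, model_parallel_size = 1234, 8, 2
for global_rank in range(world_size):
    mp_rank = global_rank % model_parallel_size      # rank inside its model parallel group
    data_parallel_seed = seed                        # identical on every rank
    model_parallel_seed = seed + 2718 + mp_rank      # differs only across the mp group
    print(global_rank, data_parallel_seed, model_parallel_seed)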
+ _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, + model_parallel_seed) + + +def get_partition_start(item): + global mp_rank, mp_size, mp_group + partition_size = get_partition_size(item) + start = partition_size * mp_rank + return int(start) + + +def get_partition_size(item): + global mp_rank, mp_size, mp_group + size = item.numel() + partition_size = size / mp_size + return int(partition_size) + + +def get_full_inputs(tensors): + inputs = [] + for i in range(int(len(tensors) / 2) - 1): + item = tensors[2 * i] + size = tensors[2 * i + 1] + partition_size = item.numel() + tensor_size = partition_size * mp_size + flat_tensor = torch.zeros([tensor_size], + dtype=item.dtype, + device=item.device) + partitions = [] + for i in range(mp_size): + part_i = flat_tensor.narrow(0, partition_size * i, partition_size) + if i == mp_rank: + part_i.copy_(item) + partitions.append(part_i) + dist.all_gather(partitions, partitions[mp_rank], group=mp_group) + input_tensor = flat_tensor.view(list(size.numpy())) + item.data = input_tensor.data + + inputs.append(item) + inputs.append(tensors[-2]) + + return tuple(inputs) + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + + @staticmethod + def forward(ctx, run_function, *args): + ctx.run_function = run_function + global mp_rank, mp_size, mp_group + if mp_rank is None: + mp_rank = get_model_parallel_rank() + mp_size = get_model_parallel_world_size() + mp_group = get_model_parallel_group() + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + if cuda_device is None: + if dist.get_rank() == 0: + print( + f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}' + ) + + cuda_device = torch.cuda.current_device() + # The transport stream is used to overlap the allgather communication for the activations + # with the computation in the backward pass + transport_stream = torch.cuda.Stream(device=cuda_device) + + if PARTITION_ACTIVATIONS: + inputs = [ + item.detach().contiguous().view(-1).narrow( + 0, get_partition_start(item), + get_partition_size(item)).clone() for item in args[:-1] + ] + inputs.append(args[-1]) + + # just in case something funky is happening such as reuse of inputs + inputs_cuda = [item.to(cuda_device) for item in args] + + # Copy the rng states. 
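Editorial aside on the activation partitioning used above: when PARTITION_ACTIVATIONS is enabled, each saved activation is flattened and only a contiguous 1/mp_size slice is kept per rank; get_full_inputs later all-gathers the slices and views them back into the original shape. A single-process sketch of that slice-and-rebuild round trip (mp_size and the tensor shape are illustrative):

import torch

mp_size = 4
activation = torch.randn(2, 3, 8)            # 48 elements, divisible by mp_size
flat = activation.contiguous().view(-1)
partition_size = flat.numel() // mp_size

# What each rank stores for the backward pass (its narrow() of the flat tensor).
slices = [flat.narrow(0, r * partition_size, partition_size).clone()
          for r in range(mp_size)]

# What the all_gather + view in get_full_inputs recovers.
rebuilt = torch.cat(slices).view(activation.size())
assert torch.equal(rebuilt, activation)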
+ ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # ctx.save_for_backward(*args) + with torch.no_grad(): + outputs = run_function(*inputs_cuda) + + del inputs_cuda + + if PARTITION_ACTIVATIONS: + new_args = [] + for arg, inp in zip(args, inputs): + size = torch.tensor(arg.size()) + arg.data = inp.data + new_args.append(arg) + new_args.append(size) + ctx.save_for_backward(*new_args) + else: + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError('Checkpointing is not compatible with .grad(), ' + 'please use .backward() if possible') + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + + if PARTITION_ACTIVATIONS: + with torch.cuda.stream(transport_stream): + inputs = get_full_inputs(ctx.saved_tensors) + detached_inputs = detach_variable(inputs) + else: + inputs = ctx.saved_tensors + detached_inputs = detach_variable(inputs) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + if PARTITION_ACTIVATIONS: + current_stream = torch.cuda.current_stream() + current_stream.wait_stream(transport_stream) + + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs, ) + torch.autograd.backward(outputs, args) + return (None, ) + tuple(inp.grad for inp in detached_inputs) + + +def checkpoint(function, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, *args) + + +def partition_activations_in_checkpoint(partition_activation): + global PARTITION_ACTIVATIONS + PARTITION_ACTIVATIONS = partition_activation + if dist.get_rank() == 0: + print( + f'**************Partition Activations {PARTITION_ACTIVATIONS}************' + ) diff --git a/modelscope/models/nlp/mglm/mpu/tests/__init__.py b/modelscope/models/nlp/mglm/mpu/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/mpu/tests/commons.py b/modelscope/models/nlp/mglm/mpu/tests/commons.py new file mode 100644 index 00000000..ecfd5e72 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/commons.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
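Editorial aside: a hedged usage sketch for the checkpoint() and partition_activations_in_checkpoint() entry points defined above. The layer objects and their argument list are placeholders, not part of this patch, and calling the function requires torch.distributed plus initialize_model_parallel to already be set up:

from modelscope.models.nlp.mglm.mpu import (checkpoint,
                                            partition_activations_in_checkpoint)

def forward_layers(layers, hidden_states, attention_mask, partition=True):
    # Optionally shard the activations saved for backward across the model
    # parallel group (uses torch.distributed, so the process group and
    # initialize_model_parallel must already be initialized).
    partition_activations_in_checkpoint(partition)
    # Each layer's forward is re-run during backward under the saved RNG
    # states instead of keeping its intermediate activations in memory.
    for layer in layers:
        hidden_states = checkpoint(layer, hidden_states, attention_mask)
    return hidden_states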
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import random + +import mpu +import numpy +import torch + + +class IdentityLayer(torch.nn.Module): + + def __init__(self, size, scale=1.0): + super(IdentityLayer, self).__init__() + self.weight = torch.nn.Parameter(scale * torch.randn(size)) + + def forward(self): + return self.weight + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def initialize_distributed(backend='nccl'): + """Initialize torch.distributed.""" + # Get local rank in case it is provided. + parser = argparse.ArgumentParser() + parser.add_argument( + '--local_rank', + type=int, + default=None, + help='local rank passed from distributed launcher') + args = parser.parse_args() + local_rank = args.local_rank + + # Get rank and world size. + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv('WORLD_SIZE', '1')) + + print('> initializing torch.distributed with local rank: {}, ' + 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) + + # Set the device id. + device = rank % torch.cuda.device_count() + if local_rank is not None: + device = local_rank + torch.cuda.set_device(device) + + # Call the init process. + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend=backend, + world_size=world_size, + rank=rank, + init_method=init_method) + + +def print_separator(message): + torch.distributed.barrier() + filler_len = (78 - len(message)) // 2 + filler = '-' * filler_len + string = '\n' + filler + ' {} '.format(message) + filler + if torch.distributed.get_rank() == 0: + print(string, flush=True) + torch.distributed.barrier() diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py new file mode 100644 index 00000000..47fd1d7e --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py @@ -0,0 +1,106 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import sys + +import mpu +import torch +import torch.nn.functional as F +from commons import (IdentityLayer, initialize_distributed, print_separator, + set_random_seed) +from mpu.cross_entropy import vocab_parallel_cross_entropy + +sys.path.append('../..') + + +def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, + seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + target = torch.cuda.LongTensor(size=(batch_size, + seq_length)).random_(0, vocab_size) + loss = F.cross_entropy( + logits.view(-1, + logits.size()[-1]), target.view(-1), + reduction='none').view_as(target).mean() + loss.backward() + return loss, identity.weight.grad + + +def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + logits_parallel = mpu.scatter_to_model_parallel_region(logits) + target = torch.cuda.LongTensor(size=(batch_size, + seq_length)).random_(0, vocab_size) + loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() + loss.backward() + return loss, identity.weight.grad + + +def test_cross_entropy(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cross entropy with model parallel size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 13 + seq_length = 17 + vocab_size_per_partition = 11 + logits_scale = 1000.0 + vocab_size = vocab_size_per_partition * model_parallel_size + seed = 1234 + + loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed) + + error = loss_torch.sub_(loss_mpu).abs().max() + print(' max error in loss on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = grad_torch.sub_(grad_mpu).abs().max() + print(' max error in grad on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cross entropy') + test_cross_entropy(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_data.py b/modelscope/models/nlp/mglm/mpu/tests/test_data.py new file mode 100644 index 00000000..66575300 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_data.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator +from mpu import data as data_utils + +sys.path.append('../..') + + +def test_boradcast_data(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print( + '> testing boradcast_data with model parallel size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + torch.manual_seed(1234 + mpu.get_data_parallel_rank()) + model_parallel_size = mpu.get_model_parallel_world_size() + + key_size_t = { + 'key1': [7, 11], + 'key2': [8, 2, 1], + 'key3': [13], + 'key4': [5, 1, 2], + 'key5': [5, 12] + } + keys = list(key_size_t.keys()) + + data = {} + data_t = {} + for key in key_size_t: + data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) + data_t[key] = data[key].clone() + data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) + data_t['keyX'] = data['keyX'].clone() + if mpu.get_model_parallel_rank() != 0: + data = None + + data_utils._check_data_types(keys, data_t, torch.int64) + key_size, key_numel, \ + total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) + for key in keys: + assert key_size[key] == key_size_t[key] + total_numel_t = 0 + for key in keys: + target_size = functools.reduce(operator.mul, key_size_t[key], 1) + assert key_numel[key] == target_size + total_numel_t += target_size + assert total_numel == total_numel_t + + data_b = data_utils.broadcast_data(keys, data, torch.int64) + for key in keys: + tensor = data_t[key].cuda() + assert data_b[key].sub(tensor).abs().max() == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test test boradcast data') + test_boradcast_data(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py new file mode 100644 index 00000000..df62d213 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py @@ -0,0 +1,95 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator + +sys.path.append('../..') + + +def test_initialize_model_parallel(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing initialize_model_parallel with size {} ...'.format( + model_parallel_size)) + model_parallel_size_ = min(model_parallel_size, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size_) + assert mpu.model_parallel_is_initialized() + + # Checks. + def check(group, world_size, rank): + assert world_size == torch.distributed.get_world_size(group=group) + assert rank == torch.distributed.get_rank(group=group) + + # Model parallel. + world_size = model_parallel_size_ + rank = torch.distributed.get_rank() % model_parallel_size_ + assert world_size == mpu.get_model_parallel_world_size() + assert rank == mpu.get_model_parallel_rank() + check(mpu.get_model_parallel_group(), world_size, rank) + + # Data parallel. + world_size = torch.distributed.get_world_size() // model_parallel_size_ + rank = torch.distributed.get_rank() // model_parallel_size + assert world_size == mpu.get_data_parallel_world_size() + assert rank == mpu.get_data_parallel_rank() + check(mpu.get_data_parallel_group(), world_size, rank) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_get_model_parallel_src_rank(model_parallel_size_): + + if torch.distributed.get_rank() == 0: + print('> testing get_model_parallel_src_rank with size {} ...'.format( + model_parallel_size_)) + model_parallel_size = min(model_parallel_size_, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size) + assert mpu.model_parallel_is_initialized() + + # Checks + src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() + assert mpu.get_model_parallel_src_rank() == src_rank + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test initialize model parallel') + test_initialize_model_parallel(model_parallel_size) + print_separator('test model parallel source rank') + test_get_model_parallel_src_rank(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_layers.py b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py new file mode 100644 index 00000000..2dbc987a --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py @@ -0,0 +1,533 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
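Editorial aside on the rank arithmetic exercised by test_initialize_model_parallel and test_get_model_parallel_src_rank above: it can be spelled out without any GPUs, since initialize_model_parallel carves the world into contiguous model parallel groups and strided data parallel groups. A pure-Python sketch for the 8-rank, model_parallel_size=2 layout from the initialize.py docstring:

world_size, model_parallel_size = 8, 2

model_parallel_groups = [list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
                         for i in range(world_size // model_parallel_size)]
data_parallel_groups = [list(range(i, world_size, model_parallel_size))
                        for i in range(model_parallel_size)]

print(model_parallel_groups)   # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(data_parallel_groups)    # [[0, 2, 4, 6], [1, 3, 5, 7]]

# The identities checked by the tests, for an arbitrary rank:
rank = 5
mp_rank = rank % model_parallel_size        # rank inside its model parallel group
dp_rank = rank // model_parallel_size       # rank inside its data parallel group
src_rank = (rank // model_parallel_size) * model_parallel_size
assert rank in model_parallel_groups[dp_rank]
assert rank in data_parallel_groups[mp_rank]
assert src_rank == model_parallel_groups[dp_rank][0]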
+ +import random +import sys + +import mpu +import torch +import torch.nn.init as init +from commons import initialize_distributed, print_separator, set_random_seed +from mpu import layers +from torch.nn.parameter import Parameter + +sys.path.append('../..') + + +def test_parallel_embedding(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing parallel embedding with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 17 + seq_length = 23 + vocab_size = 48 + hidden_size = 16 + seed = 1236 + + set_random_seed(123) + input_data = torch.LongTensor(size=(batch_size, seq_length)).random_( + 0, vocab_size).cuda() + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + + set_random_seed(seed) + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + + output = embedding_original(input_data) + loss_original = torch.mul(output, loss_weight).sum() + loss_original.backward() + + set_random_seed(seed) + embedding_parallel = layers.ParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_parallel(input_data) + loss_parallel = torch.mul(output, loss_weight).sum() + loss_parallel.backward() + + set_random_seed(seed) + embedding_vocab_parallel = layers.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_vocab_parallel(input_data) + loss_vocab_parallel = torch.mul(output, loss_weight).sum() + loss_vocab_parallel.backward() + + torch.distributed.barrier() + error = loss_parallel.sub(loss_original).abs() + print(' error in loss (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + torch.distributed.barrier() + error = loss_vocab_parallel.sub(loss_original).abs() + print(' error in loss (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + hidden_size // model_parallel_size, + 1)[mpu.get_model_parallel_rank()] + error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() + print(' error in grad (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + vocab_size // model_parallel_size, + 0)[mpu.get_model_parallel_rank()] + error = embedding_vocab_parallel.weight.grad.sub( + weight_grad_orig).abs().max() + print(' error in grad (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_initialize_affine_weight(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing initialize_affine_weight with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + + # 
--------------- + # Column parallel + # --------------- + weight = torch.empty(output_size_coeff, input_size) + set_random_seed(seed) + layers._initialize_affine_weight(weight, output_size, input_size, + output_size_coeff, 0, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split( + master_weight, output_size_coeff, dim=0)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' column parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # ------------ + # Row parallel + # ------------ + weight = torch.empty(output_size, input_size_coeff) + set_random_seed(seed) + mpu.layers._initialize_affine_weight(weight, output_size, input_size, + input_size_coeff, 1, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split( + master_weight, input_size_coeff, dim=1)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' row parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer2D(torch.nn.Module): + + def __init__(self, m, n): + super(IdentityLayer2D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def test_column_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing ColumnParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.ColumnParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split( + dLdA, output_size_coeff, dim=0)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + my_dLdb = torch.split( + dLdb, output_size_coeff, dim=0)[rank].contiguous().clone() + error = my_dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def test_row_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing RowParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.RowParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
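+    # Same hand-derived reference gradients as in the column-parallel test,
+    # but RowParallelLinear splits A along the input dimension, so each rank
+    # is checked against a column slice of dL/dA; the bias is not partitioned
+    # and is compared without slicing.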
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split( + dLdA, input_size_coeff, dim=1)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer3D(torch.nn.Module): + + def __init__(self, m, n, k): + super(IdentityLayer3D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n, k)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def parallel_self_attention(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, + sequence_length): + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( + ) # noqa + hidden_size = hidden_size_per_att_head * num_att_heads + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, + dropout_prob).cuda() + loss_weight = torch.randn([batch_size, sequence_length, + hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = attention_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + attention_layer, identity_layer + + +def test_parallel_self_attention(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelSelfAttention with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + dropout_prob = 0.0 # has to be zero + batch_size = 5 + sequence_length = 13 + + rank_1, hideen_size_1, model_parallel_size_1, loss_1, \ + attention_layer_1, identity_layer_1 = parallel_self_attention( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + attention_layer, identity_layer = parallel_self_attention( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + assert hideen_size_1 == hidden_size + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + 
torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + my_lin_grad_list = torch.split( + attention_layer_1.query_key_value.weight.grad, + hidden_size // model_parallel_size, 0)[rank::model_parallel_size] + my_lin_grad = torch.cat(my_lin_grad_list, dim=0) + error = my_lin_grad.sub( + attention_layer.query_key_value.weight.grad).abs().max() + torch.distributed.barrier() + print(' weight gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def parallel_transformer(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, + sequence_length): + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( + ) + hidden_size = hidden_size_per_att_head * num_att_heads + intermediate_size = 4 * hidden_size + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + transformer_layer = mpu.BertParallelTransformerLayer( + hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, + torch.nn.functional.relu, 1.0e-5).cuda() + + loss_weight = torch.randn([batch_size, sequence_length, + hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = transformer_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer + + +def test_parallel_transformer_layer(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelTransformerLayer with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + batch_size = 5 + sequence_length = 13 + + rank_1, hidden_size_1, model_parallel_size_1, loss_1, \ + transformer_layer_1, identity_layer_1 = parallel_transformer( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer = parallel_transformer( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +if __name__ == '__main__': + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = 
False + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + print_separator('test initialize affine weight') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_initialize_affine_weight(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test parallel embedding') + test_parallel_embedding(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test column-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_column_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test row-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_row_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel self-attention') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_self_attention(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel transformer') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_transformer_layer(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_random.py b/modelscope/models/nlp/mglm/mpu/tests/test_random.py new file mode 100644 index 00000000..55cc2351 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_random.py @@ -0,0 +1,206 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator + +sys.path.append('../..') + + +def test_set_cuda_rng_state(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing set_rng_state with size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + size = 123 + seed = 1234 + torch.cuda.manual_seed(seed) + tensor = torch.cuda.FloatTensor(size) + + # Get the state + rng_state = torch.cuda.get_rng_state() + rng_state_copy = rng_state.clone() + + # Do some stuff. + for _ in range(5): + torch.randn(size, out=tensor) + result_1 = tensor.clone() + + assert rng_state.sub(rng_state_copy).max() == 0 + assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + + # State should be different. + new_rng_state = torch.cuda.get_rng_state() + max_diff = new_rng_state.sub(rng_state).max() + print( + ' max diff in rng state (should be non-zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), max_diff)) + assert max_diff > 0 + + # Reset the rng state and do the same stuff. 
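+    # Restoring the saved state (twice, to also exercise a repeated restore)
+    # and regenerating must reproduce result_1 exactly, and must leave the
+    # saved rng_state tensor itself unchanged.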
+ mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + result_2 = tensor.clone() + + # Results should be the same + error = result_2.sub(result_1).abs().max() + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Input state should have remained intact. + error = rng_state.sub(rng_state_copy).max() + print(' max error in rng state (should be zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), error)) + assert error == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_cuda_rng_tracker(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cuda rng tracker with size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed_1 = 1234 + seed_2 = 4321 + size = [12, 21] + tensor = torch.cuda.FloatTensor(size) + + # Set to seed_1 and generate two tensors. + torch.cuda.manual_seed(seed_1) + torch.randn(size, out=tensor) + target_11 = tensor.clone() + torch.randn(size, out=tensor) + target_12 = tensor.clone() + + # Set to seed_2 and generate two tensors. + torch.cuda.manual_seed(seed_2) + torch.randn(size, out=tensor) + target_21 = tensor.clone() + torch.randn(size, out=tensor) + target_22 = tensor.clone() + + # Now if we interleave seed_1 and seed_2, + # we should still get the same tensors + torch.cuda.manual_seed(seed_1) + mpu.get_cuda_rng_tracker().add('test', seed_2) + + torch.randn(size, out=tensor) + result_11 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_21 = tensor.clone() + + torch.randn(size, out=tensor) + result_12 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_22 = tensor.clone() + + diff = result_11.sub(result_21).abs().max() + diff = min(diff, result_12.sub(result_22).abs().max()) + print(' max diff in generated tensors (should be non-zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) + assert diff > 1.0e-6 + error = max( + result_11.sub(target_11).abs().max(), + result_12.sub(target_12).abs().max()) + error = max(error, result_21.sub(target_21).abs().max()) + error = max(error, result_22.sub(target_22).abs().max()) + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_model_parallel_cuda_manual_seed(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing model parallel cuda manual seed with size {} ...'. 
+ format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + mpu.model_parallel_cuda_manual_seed(12345) + assert torch.cuda.initial_seed() == 12345 + with mpu.get_cuda_rng_tracker().fork(): + assert torch.cuda.initial_seed() == (12345 + 2718 + + mpu.get_model_parallel_rank()) + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test set rng state') + test_set_cuda_rng_state(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cuda rng tracker') + test_cuda_rng_tracker(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test model parallel cuda manual seed') + test_model_parallel_cuda_manual_seed(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/transformer.py b/modelscope/models/nlp/mglm/mpu/transformer.py new file mode 100755 index 00000000..c12b2e10 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/transformer.py @@ -0,0 +1,1200 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformer.""" + +import math + +import deepspeed +import torch +import torch.nn.init as init +from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +from .initialize import get_model_parallel_world_size +from .layers import ColumnParallelLinear, RowParallelLinear +from .mappings import gather_from_model_parallel_region +from .random import checkpoint, get_cuda_rng_tracker +from .utils import divide, split_tensor_along_last_dim + + +class PositionalEmbedding(torch.nn.Module): + + def __init__(self, hidden_size): + super(PositionalEmbedding, self).__init__() + + self.hidden_size = hidden_size + + inv_freq = 1 / ( + 10000**(torch.arange(0.0, hidden_size, 2.0) / hidden_size)) # noqa + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class ParallelCrossAttention(torch.nn.Module): + """Parallel cross-attention layer for Transformer""" + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=None): + super(ParallelCrossAttention, self).__init__() + # Set output layer initialization if not provided. 
+ if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + # Strided linear layer. + self.query = ColumnParallelLinear( + hidden_size, + hidden_size, + gather_output=False, + init_method=init_method) + self.key_value = ColumnParallelLinear( + hidden_size, + 2 * hidden_size, + stride=2, + gather_output=False, + init_method=init_method) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, encoder_states, cross_mask): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + mixed_query_layer = self.query(hidden_states) + mixed_x_layer = self.key_value(encoder_states) + (mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 2) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + if cross_mask is not None: + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, cross_mask) - \ + 10000.0 * (1.0 - cross_mask) # noqa + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. 
[b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + + +class ParallelSelfAttention(torch.nn.Module): + """Parallel self-attention layer for GPT2. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). + num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + attention_dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(ParallelSelfAttention, self).__init__() + self.performer = performer + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + self.relative_encoding = relative_encoding + self.attention_scale = attention_scale + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * hidden_size, + stride=3, + gather_output=False, + init_method=init_method) + if relative_encoding: + self.relative = ColumnParallelLinear( + hidden_size, + hidden_size, + gather_output=False, + init_method=init_method) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. 
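+        Moving the head dimension ahead of the sequence dimension lets the
+        attention scores be computed with a single batched matmul.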
+ """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + @staticmethod + def _rel_shift(x, zero_triu=False): + # ql x kl x bsz x h + # bsz x h x ql x kl + zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1), + device=x.device, + dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2)) + + x = x_padded[:, :, 1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None] + + return x + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + query_length = hidden_states.size(1) + + if mem is None: + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim( + mixed_x_layer, 3) + else: + cat = torch.cat((mem, hidden_states), 1) + mixed_x_layer = self.query_key_value(cat) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim( + mixed_x_layer, 3) + mixed_query_layer = mixed_query_layer[:, -query_length:] + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + if self.relative_encoding: + relative_layer = self.relative(position_embeddings) + relative_layer = self._transpose_for_scores( + relative_layer) # 1 (bsz) x n_head x klen x d_head + # Raw attention scores. [b, np, qs, ks] + rw_head_q = query_layer + r_w_bias.unsqueeze(1) + ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2)) + rr_head_q = query_layer + r_r_bias.unsqueeze(1) + bd_score = torch.matmul(rr_head_q, + relative_layer.transpose(-1, -2)) + bd_score = self._rel_shift(bd_score) # qlen x klen x bsz x n_head + # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen + + attention_scores = ac_score + bd_score + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + else: + if self.attention_scale > 1.0: + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul( + query_layer / math.sqrt(self.attention_scale), + key_layer.transpose(-1, -2) + / math.sqrt(self.hidden_size_per_attention_head + * self.attention_scale)) + else: + attention_scores = torch.matmul( + query_layer, + key_layer.transpose(-1, -2) + / math.sqrt(self.hidden_size_per_attention_head)) + + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, ltor_mask) + if self.attention_scale > 1.0: + max_attention_scores = attention_scores.max( + dim=-1, keepdim=True)[0] + attention_scores -= max_attention_scores + attention_scores *= self.attention_scale + # if torch.distributed.get_rank() == 0: + # print(min_attention_scores, attention_scores.max().item()) + attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask) + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
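+        # fork() switches to the model-parallel RNG stream (seeded per rank),
+        # so every partition draws an independent dropout mask while random
+        # ops outside the fork stay identical across model-parallel ranks.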
+ with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * ( + 1.0 + torch.tanh(0.7978845608028654 * x * # noqa + (1.0 + 0.044715 * x * x))) # noqa + + +def gelu(x): + return gelu_impl(x) + + +class ParallelMLP(torch.nn.Module): + """MLP for GPT2. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform gelu transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + + Arguments: + hidden_size: The hidden size of the self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layer initialization. If None, + use `init_method`. + """ + + def __init__(self, + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=None): + super(ParallelMLP, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Project to 4h. + self.dense_h_to_4h = ColumnParallelLinear( + hidden_size, + 4 * hidden_size, + gather_output=False, + init_method=init_method) + # Project back to h. + self.dense_4h_to_h = RowParallelLinear( + 4 * hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.dropout = torch.nn.Dropout(output_dropout_prob) + + def forward(self, hidden_states): + # [b, s, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = gelu(intermediate_parallel) + + # [b, s, h] + output = self.dense_4h_to_h(intermediate_parallel) + output = self.dropout(output) + return output + + +class ParallelDecoderLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. 
+ """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None): + super(ParallelDecoderLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.self_attention = ParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + # Layernorm after the self attention. + self.post_self_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + self.cross_attention = ParallelCrossAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + # Layernorm after the cross attention. + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = ParallelMLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + encoder_states, + ltor_mask, + cross_mask=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + self_attention_output = self.self_attention(layernorm_output, + ltor_mask) + # Residual connection. + self_layernorm_input = hidden_states + self_attention_output + # Layer norm post the self attention. + self_layernorm_output = self.post_self_layernorm(self_layernorm_input) + # Cross attention + attention_output = self.cross_attention(self_layernorm_output, + encoder_states, cross_mask) + # Residual connection + layernorm_input = self_layernorm_input + attention_output + # Layer norm post the cross attention + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. + output = layernorm_input + mlp_output + return output + + +class ParallelTransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. 
+ """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(ParallelTransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = ParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Layernorm on the input data. + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = ParallelMLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, + position_embeddings, r_w_bias, + r_r_bias, mem) + # Residual connection. + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. + output = layernorm_input + mlp_output + + return output + + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GPT2ParallelTransformer(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). 
+ use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + + def __init__( + self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + relative_encoding=False, + block_position_encoding=False, + performer=False, + use_decoder_layer=False, + attention_scale=1.0, + ): + super(GPT2ParallelTransformer, self).__init__() + self.hidden_size = hidden_size + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + self.performer = performer + self.use_decoder_layer = use_decoder_layer + assert not (performer and relative_encoding) + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method( + init_method_std, num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.relative_encoding = relative_encoding + self.block_position_encoding = block_position_encoding + if relative_encoding: + # Relative position embedding + self.position_embeddings = PositionalEmbedding(hidden_size) + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_attention_head = divide( + hidden_size, num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + self.r_w_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head)) + self.r_w_bias.model_parallel = True + self.r_r_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head)) + self.r_r_bias.model_parallel = True + # Always initialize bias to zero. + with torch.no_grad(): + self.r_w_bias.zero_() + self.r_r_bias.zero_() + else: + # Position embedding (serial). + if block_position_encoding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + self.block_position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + torch.nn.init.normal_( + self.block_position_embeddings.weight, + mean=0.0, + std=init_method_std) + else: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, hidden_size) + # Initialize the position embeddings. + torch.nn.init.normal_( + self.position_embeddings.weight, mean=0.0, std=init_method_std) + + def get_layer(): + if use_decoder_layer: + return ParallelDecoderLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method) + else: + return ParallelTransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. 
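+        # Applied to the last layer's output in forward() so the logit layer
+        # always consumes normalised hidden states.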
+ self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def forward(self, + hidden_states, + position_ids, + attention_mask, + memory_states=None, + encoder_states=None, + return_memory=False, + detach_memory=True): + batch_size, query_length = hidden_states.size()[:2] + memory_length = memory_states[0].size(1) if memory_states else 0 + key_length = query_length + memory_length + # attention mask is the beginning postion of B region, \in [0, query_len) + is_scalar = torch.numel(attention_mask) == 1 + is_sep = is_scalar or torch.numel(attention_mask) == batch_size + if self.performer: + assert is_scalar, 'attention_mask should be a scalar to indicate the seperation position.' + assert memory_length == 0, 'Do not support transformer-xl.' + if is_sep: + sep = attention_mask.item() if is_scalar else attention_mask + + # conventional transformer + def build_mask_matrix(seq_length, sep, memory_length=0): + m = hidden_states.new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + if is_scalar: + m[0, :, :sep] = 1 + else: + m = m.expand(batch_size, -1, -1) + ids = torch.arange( + seq_length, device=sep.device, + dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) + m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1) + if memory_length > 0: + m = m.expand(batch_size, -1, -1) + m = torch.cat( + (hidden_states.new_ones((batch_size, seq_length, + memory_length)), m), # noqa + dim=2) # noqa + m = m.unsqueeze(1) + return m + + if not self.performer: + attention_mask = build_mask_matrix( + query_length, sep, memory_length=memory_length) + else: + attention_mask = attention_mask[:, :, :, + -query_length - memory_length:] + + if self.relative_encoding: + position_sequence = torch.arange( + key_length - 1, + -1, + -1.0, + device=hidden_states.device, + dtype=hidden_states.dtype) + position_embeddings = self.position_embeddings(position_sequence) + # Apply dropout + position_embeddings = self.embedding_dropout(position_embeddings) + else: + if self.block_position_encoding: + position_ids, block_position_ids = position_ids[:, + 0], position_ids[:, + 1] + position_embeddings = self.position_embeddings(position_ids) + hidden_states = hidden_states + position_embeddings + if self.block_position_encoding: + block_position_embeddings = self.block_position_embeddings( + block_position_ids) + hidden_states = hidden_states + block_position_embeddings + hidden_states = self.embedding_dropout(hidden_states) + + def check_detach(_hidden_states): + if detach_memory: + return _hidden_states.detach() + return _hidden_states + + if self.max_memory_length > 0 or return_memory: + mem_layers = [check_detach(hidden_states)] + else: + mem_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers_ = self.layers[start:end] + x_, inputs = inputs[0], inputs[1:] + if self.relative_encoding: + inputs, mems_ = inputs[:4], inputs[4:] + else: + inputs, mems_ = inputs[:1], inputs[1:] + for i, layer in enumerate(layers_): + mem_i_ = mems_[i] if mems_ else None + x_ = layer(x_, *inputs, mem=mem_i_) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(x_)) + return x_ + + return custom_forward + + if self.checkpoint_activations: + l = 0 # noqa + num_layers = len(self.layers) + chunk_length = self.checkpoint_num_layers + while l < num_layers: + args = 
[hidden_states, attention_mask + ] if not self.use_decoder_layer else [ + hidden_states, + encoder_states, + attention_mask # noqa + ] # noqa + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + if memory_states: + args += memory_states[l:l + chunk_length] + hidden_states = checkpoint(custom(l, l + chunk_length), *args) + l += chunk_length # noqa + else: + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask + ] if not self.use_decoder_layer else [ + hidden_states, + encoder_states, + attention_mask # noqa + ] # noqa + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + mem_i = memory_states[i] if memory_states else None + hidden_states = layer(*args, mem=mem_i) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(hidden_states)) + + # Final layer norm. + output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0 or return_memory: + mem_layers = self.update_mems( + mem_layers, memory_states, return_memory=return_memory) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems, return_memory=False): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = memory_length + query_length + if not return_memory: + new_memory_length = min(self.max_memory_length, new_memory_length) + new_mems = [] + # with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append( + torch.cat((mems[i][:, -new_memory_length + query_length:], + hiddens[i]), + dim=1)) + return new_mems + + +class BertParallelSelfAttention(torch.nn.Module): + """Parallel self-attention layer for BERT. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). + num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size be divisible by n. + dropout_prob: dropout probability for the attention scores. + output_parallel: If true, no all-gather is done on the output and + the output values will be per partition. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + + def __init__(self, + hidden_size, + num_attention_heads, + dropout_prob, + output_parallel=False, + init_method=init.xavier_normal_): + super(BertParallelSelfAttention, self).__init__() + # Input configuration. + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.dropout_prob = dropout_prob + self.output_parallel = output_parallel + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * hidden_size, + stride=3, + gather_output=False, + init_method=init_method) + # Dropout. 
Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.dropout = torch.nn.Dropout(dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + + # Attention heads. [b, s, hp] + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + # Raw attention scores. [b, np, s, s] + norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head)) + attention_scores = torch.matmul( + query_layer / norm_factor, + key_layer.transpose(-1, -2) / norm_factor) + # Apply the attention mask. + attention_scores += attention_mask + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size_per_partition, ) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + if self.output_parallel: + output = context_layer + else: + output = gather_from_model_parallel_region(context_layer) + + return output + + +class BertParallelTransformerOutput(torch.nn.Module): + """The output layer used after self attention and intermediate + parts of transformer layer.""" + + def __init__(self, + input_size, + output_size, + dropout_prob, + layernorm_epsilon=1.0e-12, + input_is_parallel=False, + init_method=init.xavier_normal_): + super(BertParallelTransformerOutput, self).__init__() + # Components. + self.dense = RowParallelLinear( + input_size, + output_size, + input_is_parallel=input_is_parallel, + init_method=init_method) + self.dropout = torch.nn.Dropout(dropout_prob) + self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + layernorm_input = hidden_states + input_tensor + hidden_states = self.layernorm(layernorm_input) + return hidden_states + + +class BertParallelTransformerLayer(torch.nn.Module): + """A single layer transformer for Bert. 
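+    It runs BertParallelSelfAttention, then an intermediate ColumnParallelLinear
+    with an activation, wrapping each with a BertParallelTransformerOutput
+    (dense projection, dropout, residual add and LayerNorm).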
+ + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + intermediate_size: size of the intermediate state after + self attention. In both BERT and GPT + this is set to be 4 times the hidden + size. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + intermediate_activation_fn: activation function for output + of intermediate. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + """ + + def __init__(self, + hidden_size, + intermediate_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + intermediate_activation_fn, + layernorm_epsilon, + init_method=init.xavier_normal_): + super(BertParallelTransformerLayer, self).__init__() + + # Self attention. + self.attention = BertParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_parallel=True, + init_method=init_method) + # Self attention output. + self.self_output = BertParallelTransformerOutput( + hidden_size, + hidden_size, + output_dropout_prob, + layernorm_epsilon=layernorm_epsilon, + input_is_parallel=True, + init_method=init_method) + # Intermediate. + self.intermediate = ColumnParallelLinear( + hidden_size, + intermediate_size, + gather_output=False, + init_method=init_method) + self.intermediate_activation_fn = intermediate_activation_fn + # Output. + self.output = BertParallelTransformerOutput( + intermediate_size, + hidden_size, + output_dropout_prob, + layernorm_epsilon=layernorm_epsilon, + input_is_parallel=True, + init_method=init_method) + + def forward(self, hidden_states, attention_mask): + # [b, s, hp] + attention_output_parallel = self.attention(hidden_states, + attention_mask) + # [b, s, h] + attention_self_output = self.self_output(attention_output_parallel, + hidden_states) + # [b, s, ip] + intermediate_output_parallel = self.intermediate(attention_self_output) + intermediate_output_parallel = self.intermediate_activation_fn( + intermediate_output_parallel) + # [b, s, h] + layer_output = self.output(intermediate_output_parallel, + attention_self_output) + + return layer_output diff --git a/modelscope/models/nlp/mglm/mpu/utils.py b/modelscope/models/nlp/mglm/mpu/utils.py new file mode 100644 index 00000000..76c37a2b --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/utils.py @@ -0,0 +1,70 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
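+"""Tensor-partitioning helpers shared by the mpu package: divisibility checks,
+last-dimension splitting, and per-rank vocabulary range computation."""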
+ +import torch + + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, '{} is not divisible by {}'.format( + numerator, denominator) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + + +def split_tensor_along_last_dim(tensor, + num_partitions, + contiguous_split_chunks=False): + """Split a tensor along its last dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = divide(tensor.size()[last_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class VocabUtility: + """Split the vocabulary into `world_size` chunks amd return the + first and last index of the vocabulary belonging to the `rank` + partition: Note that indecies in [fist, last)""" + + @staticmethod + def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, + rank, world_size): + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f, index_l + + @staticmethod + def vocab_range_from_global_vocab_size(global_vocab_size, rank, + world_size): + per_partition_vocab_size = divide(global_vocab_size, world_size) + return VocabUtility.vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, world_size) diff --git a/modelscope/models/nlp/mglm/process_grid.py b/modelscope/models/nlp/mglm/process_grid.py new file mode 100644 index 00000000..d425c970 --- /dev/null +++ b/modelscope/models/nlp/mglm/process_grid.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 Zhipu.AI + +import glob +import os +import statistics +import sys + +import json + +path_pattern = sys.argv[1] +target_type = sys.argv[2] +best_value, best_result, best_name = None, None, None +mean_result = {} +print(path_pattern) +for dir_path in glob.glob(path_pattern, recursive=True): + entry = os.path.basename(dir_path) + valid_result = None + test_found = os.path.exists(os.path.join(dir_path, 'test_results.json')) + valid_path = os.path.join(dir_path, 'results.json') + if os.path.exists(valid_path): + print(entry) + with open(valid_path) as file: + valid_result = json.load(file) + else: + print(f'{entry} no validation results') + continue + if not test_found: + print(f'{entry} not tested yet') + if target_type == 'max': + metric = sys.argv[3] + metric_value = valid_result[metric] + if best_value is None or metric_value > best_value: + best_value = metric_value + best_result = valid_result + best_name = entry + elif target_type == 'mean' or target_type == 'median': + if mean_result: + for metric, value in valid_result.items(): + if metric not in ['type', 'epoch']: + mean_result[metric].append(value) + else: + mean_result = { + metric: [value] + for metric, value in valid_result.items() + if metric not in ['type', 'epoch'] + } + +if target_type == 'max': + print(f'Best result found at {best_name}: {best_result}') +elif target_type == 'mean': + mean_result = { + metric: sum(value) / len(value) + for metric, 
value in mean_result.items() + } + print(f'Mean result {mean_result}') +elif target_type == 'median': + mean_result = { + metric: statistics.median(value) + for metric, value in mean_result.items() + } + print(f'Mean result {mean_result}') diff --git a/modelscope/models/nlp/mglm/requirements.txt b/modelscope/models/nlp/mglm/requirements.txt new file mode 100644 index 00000000..e44ae5d1 --- /dev/null +++ b/modelscope/models/nlp/mglm/requirements.txt @@ -0,0 +1,22 @@ +boto3 +botocore +deepspeed +fasttext +filelock +ftfy +langdetect +lsh +matplotlib +mpi4py +nltk +pandas +regex +requests +rouge_score +scikit_learn +scipy +sentencepiece +termcolor +tldextract +tqdm +transformers diff --git a/modelscope/models/nlp/mglm/run_test.py b/modelscope/models/nlp/mglm/run_test.py new file mode 100644 index 00000000..2f568265 --- /dev/null +++ b/modelscope/models/nlp/mglm/run_test.py @@ -0,0 +1,10 @@ +# Copyright (c) 2022 Zhipu.AI + +import sys + +if sys.argv[1] == 'block': + from test.test_block import main + main() +elif sys.argv[1] == 'rel_shift': + from test.test_rel_shift import main + main() diff --git a/modelscope/models/nlp/mglm/tasks/data_utils.py b/modelscope/models/nlp/mglm/tasks/data_utils.py new file mode 100644 index 00000000..179d304e --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/data_utils.py @@ -0,0 +1,389 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tasks data utility.""" +import copy +import pickle +import re +from typing import Dict, List, Optional + +import json +import numpy as np +import torch +import torch.utils.data +from torch.utils.data.dataloader import default_collate + +from modelscope.models.nlp.mglm import mpu + + +def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace('\n', ' ') + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + +class InputExample(object): + """A raw input example consisting of one or two segments of text and a label""" + + def __init__(self, + guid, + text_a, + text_b=None, + label=None, + logits=None, + meta: Optional[Dict] = None, + idx=-1, + num_choices=1): + """ + Create a new InputExample. 
+ + :param guid: a unique textual identifier + :param text_a: the sequence of text + :param text_b: an optional, second sequence of text + :param label: an optional label + :param logits: an optional list of per-class logits + :param meta: an optional dictionary to store arbitrary meta information + :param idx: an optional numeric index + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + self.logits = logits + self.idx = idx + self.num_choices = num_choices + self.meta = meta if meta else {} + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serialize this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serialize this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + @staticmethod + def load_examples(path: str) -> List['InputExample']: + """Load a set of input examples from a file""" + with open(path, 'rb') as fh: + return pickle.load(fh) + + @staticmethod + def save_examples(examples: List['InputExample'], path: str) -> None: + """Save a set of input examples to a file""" + with open(path, 'wb') as fh: + pickle.dump(examples, fh) + + +def num_special_tokens_to_add(text_a_ids, + text_b_ids, + answer_ids, + add_cls, + add_sep, + add_piece, + add_eos=True): + num_tokens = 0 + if add_cls: + num_tokens += 1 + if text_b_ids and add_sep: + num_tokens += 1 + if add_eos: + num_tokens += 1 + if not answer_ids and add_piece: + num_tokens += 1 + return num_tokens + + +def build_input_from_ids(text_a_ids, + text_b_ids, + answer_ids, + max_seq_length, + tokenizer, + args=None, + add_cls=True, + add_sep=False, + add_piece=False, + add_eos=True, + mask_id=None): + if mask_id is None: + mask_id = tokenizer.get_command('MASK').Id + eos_id = tokenizer.get_command('eos').Id + cls_id = tokenizer.get_command('ENC').Id + sep_id = tokenizer.get_command('sep').Id + ids = [] + types = [] + paddings = [] + # CLS + if add_cls: + ids.append(cls_id) + types.append(0) + paddings.append(1) + # A + len_text_a = len(text_a_ids) + ids.extend(text_a_ids) + types.extend([0] * len_text_a) + paddings.extend([1] * len_text_a) + # B + if text_b_ids is not None: + # SEP + if add_sep: + ids.append(sep_id) + types.append(0) + paddings.append(1) + len_text_b = len(text_b_ids) + ids.extend(text_b_ids) + types.extend([1] * len_text_b) + paddings.extend([1] * len_text_b) + eos_length = 1 if add_eos else 0 + # Cap the size. 
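+    # Layout of the encoder ids built above (assuming add_cls=True and a text_b
+    # segment with add_sep=True): [CLS] a_1 ... a_n [SEP] b_1 ... b_m, with token
+    # types 0 for segment A and 1 for segment B. If appending the optional eos
+    # token would overflow max_seq_length, the sequence is truncated to
+    # max_seq_length - 1 first.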
+ if len(ids) >= max_seq_length - eos_length: + max_seq_length_m1 = max_seq_length - 1 + ids = ids[0:max_seq_length_m1] + types = types[0:max_seq_length_m1] + paddings = paddings[0:max_seq_length_m1] + end_type = 0 if text_b_ids is None else 1 + if add_eos: + ids.append(eos_id) + types.append(end_type) + paddings.append(1) + sep = len(ids) + target_ids = [0] * len(ids) + loss_masks = [0] * len(ids) + position_ids = list(range(len(ids))) + block_position_ids = [0] * len(ids) + # Piece + if add_piece or answer_ids is not None: + sop_id = tokenizer.get_command('sop').Id + mask_position = ids.index( + mask_id + ) if not args.sentinel_token else args.max_position_embeddings + ids.append(sop_id) + types.append(end_type) + paddings.append(1) + position_ids.append(mask_position) + block_position_ids.append(1) + if answer_ids is not None: + len_answer = len(answer_ids) + ids.extend(answer_ids[:-1]) + types.extend([end_type] * (len_answer - 1)) + paddings.extend([1] * (len_answer - 1)) + position_ids.extend([mask_position] * (len_answer - 1)) + if not args.no_block_position: + block_position_ids.extend(range(2, len(answer_ids) + 1)) + else: + block_position_ids.extend([1] * (len(answer_ids) - 1)) + target_ids.extend(answer_ids) + loss_masks.extend([1] * len(answer_ids)) + else: + target_ids.append(0) + loss_masks.append(1) + # Padding. + padding_length = max_seq_length - len(ids) + if padding_length > 0: + ids.extend([eos_id] * padding_length) + types.extend([eos_id] * padding_length) + paddings.extend([0] * padding_length) + position_ids.extend([0] * padding_length) + block_position_ids.extend([0] * padding_length) + target_ids.extend([0] * padding_length) + loss_masks.extend([0] * padding_length) + if not args.masked_lm: + position_ids = [position_ids, block_position_ids] + return ids, types, paddings, position_ids, sep, target_ids, loss_masks + + +def build_decoder_input(enc_ids, answer_ids, max_seq_length, + max_dec_seq_length, tokenizer): + mask_id = tokenizer.get_command('MASK').Id + eos_id = tokenizer.get_command('eos').Id + sop_id = tokenizer.get_command('sop').Id + enc_len = len(enc_ids) # noqa + masks = [] + # TODO: it probably takes too much memory + # for i in range(max_dec_seq_length): + # m = [1]*enc_len + [0]*(max_seq_length - enc_len) + [1]*(i+1) + [0]*(max_dec_seq_length-1-i) + # masks.append(m) + mask_position = enc_ids.index(mask_id) + len_answer = len(answer_ids) + ids = [sop_id] + answer_ids[:-1] + types = [0] * len_answer # not used + paddings = [1] * len_answer + position_ids = [mask_position] * len_answer + block_position_ids = list(range(1, len_answer + 1)) + target_ids = answer_ids + loss_masks = [1] * len_answer + # Padding. 
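+    # Pad the decoder sequence up to max_dec_seq_length: ids are padded with the
+    # eos token, all other fields with zeros, and the padded positions are
+    # excluded from the loss via loss_masks = 0.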
+ padding_length = max_dec_seq_length - len(ids) + if padding_length > 0: + ids.extend([eos_id] * padding_length) + types.extend([0] * padding_length) + paddings.extend([0] * padding_length) + position_ids.extend([0] * padding_length) + block_position_ids.extend([0] * padding_length) + target_ids.extend([0] * padding_length) + loss_masks.extend([0] * padding_length) + position_ids = [position_ids, block_position_ids] + return ids, types, paddings, position_ids, masks, target_ids, loss_masks + + +def build_sample(ids, + types=None, + paddings=None, + positions=None, + masks=None, + label=None, + unique_id=None, + target=None, + logit_mask=None, + segment_ids=None, + prompt_ids=None): + """Convert to numpy and return a sample consumed by the batch producer.""" + + ids_np = np.array(ids, dtype=np.int64) + sample = {'text': ids_np, 'label': int(label)} + if types is not None: + types_np = np.array(types, dtype=np.int64) + sample['types'] = types_np + if paddings is not None: + paddings_np = np.array(paddings, dtype=np.int64) + sample['padding_mask'] = paddings_np + if positions is not None: + positions_np = np.array(positions, dtype=np.int64) + sample['position'] = positions_np + if masks is not None: + masks_np = np.array(masks, dtype=np.int64) + sample['mask'] = masks_np + if target is not None: + target_np = np.array(target, dtype=np.int64) + sample['target'] = target_np + if logit_mask is not None: + logit_mask_np = np.array(logit_mask, dtype=np.int64) + sample['logit_mask'] = logit_mask_np + if segment_ids is not None: + segment_ids = np.array(segment_ids, dtype=np.int64) + sample['segment_id'] = segment_ids + if prompt_ids is not None: + prompt_ids = np.array(prompt_ids, dtype=np.int64) + sample['prompt_pos'] = prompt_ids + if unique_id is not None: + sample['uid'] = unique_id + return sample + + +def build_decoder_sample(sample, dec_ids, dec_position, dec_masks, dec_target, + dec_logit_mask): + sample['dec_text'] = np.array(dec_ids) + sample['dec_position'] = np.array(dec_position) + sample['dec_mask'] = np.array(dec_masks) + sample['dec_target'] = np.array(dec_target) + sample['dec_logit_mask'] = np.array(dec_logit_mask) + return sample + + +def my_collate(batch): + new_batch = [{key: value + for key, value in sample.items() if key != 'uid'} + for sample in batch] + text_list = [sample['text'] for sample in batch] + + def pad_choice_dim(data, choice_num): + if len(data) < choice_num: + data = np.concatenate([data] + + [data[0:1]] * (choice_num - len(data))) + return data + + if len(text_list[0].shape) == 2: + choice_nums = list(map(len, text_list)) + max_choice_num = max(choice_nums) + for i, sample in enumerate(new_batch): + for key, value in sample.items(): + if key != 'label': + sample[key] = pad_choice_dim(value, max_choice_num) + else: + sample[key] = value + sample['loss_mask'] = np.array( + [1] * choice_nums[i] + [0] * (max_choice_num - choice_nums[i]), + dtype=np.int64) + + if 'dec_text' in new_batch[0]: + choice_nums = [len(sample['dec_text']) for sample in new_batch] + if choice_nums.count(choice_nums[0]) != len(choice_nums): + max_choice_num = max(choice_nums) + for i, sample in enumerate(new_batch): + for key, value in sample.items(): + if key.startswith('dec_'): + sample[key] = pad_choice_dim(value, max_choice_num) + sample['loss_mask'] = np.array( + [1] * choice_nums[i] + [0] * # noqa + (max_choice_num - choice_nums[i]), + dtype=np.int64) + + new_batch = default_collate(new_batch) + if 'uid' in batch[0]: + uid_list = [sample['uid'] for sample in batch] + new_batch['uid'] 
= uid_list + return new_batch + + +class FakeDataloader: + + def __init__(self, num_iters): + self.num_iters = num_iters + + def __iter__(self): + if self.num_iters is not None: + for _ in range(self.num_iters): + yield None + else: + while True: + yield None + + +def build_data_loader(dataset, + batch_size, + num_workers, + drop_last, + shuffle=True, + only_rank0=False): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + if only_rank0: + rank, world_size = 0, 1 + else: + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=my_collate) + + return data_loader diff --git a/modelscope/models/nlp/mglm/tasks/eval_utils.py b/modelscope/models/nlp/mglm/tasks/eval_utils.py new file mode 100644 index 00000000..da23a884 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/eval_utils.py @@ -0,0 +1,249 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation utilities.""" + +import datetime +import os +import random +import time +from collections import OrderedDict +from typing import List + +import mpu +import torch +from finetune_glm import process_batch +from sklearn.metrics import f1_score +from tasks.data_utils import InputExample, build_data_loader +from utils import debug_finetune_data, get_spare_port, print_rank_0 + + +def accuracy_metric(predictions, labels, examples): + count = 0 + num_predictions = max(len(predictions), 1) + assert len(predictions) == len(labels) + for prediction, label in zip(predictions, labels): + count += prediction == label + return count * 100.0 / num_predictions + + +def f1_metric(predictions, labels, examples): + return f1_score(labels, predictions) + + +def f1_macro_metric(predictions, labels, examples): + return f1_score(labels, predictions, average='macro') + + +global_tokenizer = None + + +def accuracy_func_provider(single_dataset_provider, + metric_dict, + args, + is_test=False, + eval_func=None, + output_func=None, + only_rank0=True, + tokenizer=None): + """Provide function that calculates accuracies.""" + # Build dataloaders. 
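+    # One dataloader is built per validation/test split below; when only_rank0 is
+    # set, every rank other than rank 0 returns None and skips evaluation.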
+ global global_tokenizer + global_tokenizer = tokenizer + if only_rank0 and torch.distributed.is_initialized( + ) and torch.distributed.get_rank() != 0: + return None + if is_test and not args.eval_valid: + datapaths = args.test_data if args.test_data is not None else ['test'] + else: + datapaths = args.valid_data if args.valid_data is not None else ['dev'] + if eval_func is None: + eval_func = multichoice_evaluate + dataloaders = [] + eval_batch_size = args.eval_batch_size if args.eval_batch_size else args.batch_size + for datapath in datapaths: + dataset = single_dataset_provider(datapath) + dataloader = build_data_loader( + dataset, + eval_batch_size, + num_workers=args.num_workers, + drop_last=False, + shuffle=False, + only_rank0=only_rank0) + dataloaders.append((dataset.dataset_name, dataloader)) + + def metrics_func(model, + epoch, + output_predictions=False, + summary_writer=None): + print_rank_0('calculating metrics ...') + score_dict = OrderedDict([(key, 0.0) for key in metric_dict + ]) if isinstance(metric_dict, dict) else { + metric_dict: 0.0 + } # noqa + total = 0 + for name, dataloader in dataloaders: + example_dict = None + if hasattr(dataloader.dataset, 'examples'): + example_dict = dataloader.dataset.examples + start_time = time.time() + predictions, labels, examples = eval_func(model, dataloader, + example_dict, args) + elapsed_time = time.time() - start_time + if output_predictions and torch.distributed.get_rank() == 0: + filename = os.path.join(args.log_dir, name + '.jsonl') + output_func(predictions, examples, filename) + total_count = len(predictions) + single_dict = { + key: metric(predictions, labels, examples) + for key, metric in metric_dict.items() + } + output_str = ' > |epoch: {}| metrics for {}: total {}'.format( + epoch, name, total_count) + for key, value in single_dict.items(): + output_str += ' {} = {:.4f} %'.format(key, value) + if summary_writer is not None and epoch >= 0 and not is_test and len( + dataloaders) > 1: + summary_writer.add_scalar(f'Train/valid_{name}_{key}', + value, epoch) + output_str += ' elapsed time (sec): {:.3f}'.format(elapsed_time) + if len(dataloaders) > 1: + print_rank_0(output_str) + for key in score_dict: + score_dict[key] += single_dict[key] * total_count + total += total_count + score_dict = { + key: score / float(total) + for key, score in score_dict.items() + } + output_str = ' >> |epoch: {}| overall: total = {}'.format(epoch, total) + for key, score in score_dict.items(): + output_str += ' {} = {:.4f}'.format(key, score) + if summary_writer is not None and epoch >= 0 and not is_test: + summary_writer.add_scalar(f'Train/valid_{key}', score, epoch) + print_rank_0(output_str) + return score_dict + + return metrics_func + + +segment_length = 10 + + +def multichoice_evaluate(model, dataloader, example_dict, args): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + model.eval() + port = get_spare_port(args) + print_rank_0(f'Using port {port}') + store = torch.distributed.TCPStore(args.master_ip, port, + torch.distributed.get_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + # file_path = os.path.join("/cache", args.experiment_name + "_store") + # print_rank_0(f"Using file store at {file_path}") + # store = torch.distributed.FileStore(file_path, torch.distributed.get_world_size()) + with torch.no_grad(): + # For all the batches in the dataset. + for _, batch in enumerate(dataloader): + # Run the model forward. 
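+            # The model inputs assembled below depend on the evaluation mode:
+            # pretrained BERT, cloze-style evaluation (with an optional separate
+            # decoder when fast_decode is set), or plain classification.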
+ data = process_batch(batch, args) + if args.pretrained_bert: + tokens, types, labels_, attention_mask = data['text'], data[ + 'types'], data['label'], data['padding_mask'] + inputs = [tokens, types, attention_mask] + elif args.cloze_eval: + tokens, labels_, position_ids = data['text'], data[ + 'label'], data['position'] + attention_mask, target_ids, logit_mask = data['mask'], data[ + 'target'], data['logit_mask'] + if not args.fast_decode: + inputs = [ + tokens, position_ids, attention_mask, target_ids, + logit_mask + ] + if args.continuous_prompt: + prompt_pos = data['prompt_pos'] + inputs.append(prompt_pos) + else: + dec_input_ids, dec_position_ids, dec_attention_mask = data[ + 'dec_text'], data['dec_position'], data['dec_mask'] + dec_target_ids, dec_logit_mask = data['dec_target'], data[ + 'dec_logit_mask'] + inputs = [ + tokens, position_ids, attention_mask, dec_input_ids, + dec_position_ids, dec_attention_mask, dec_target_ids, + dec_logit_mask + ] + else: + tokens, labels_, position_ids, attention_mask = data[ + 'text'], data['label'], data['position'], data['mask'] + inputs = [tokens, position_ids, attention_mask] + if len(inputs[0].shape + ) == 3 and inputs[0].size(1) > segment_length: + logit_list = [] + for i in range((inputs[0].size(1) - 1) // segment_length + 1): + input_batch = [ + arg[:, i * segment_length:(i + 1) * segment_length] + for arg in inputs + ] + if args.pretrained_bert: + logits = model(*input_batch) + else: + logits, *mems = model(*input_batch) + logit_list.append(logits) + logits = torch.cat(logit_list, dim=1) + elif args.cloze_eval and args.fast_decode: + logit_list = [] + num_choices = inputs[3].size(1) + for i in range((num_choices - 1) // segment_length + 1): + input_batch = inputs[:3] + [ + arg[:, i * segment_length:(i + 1) * segment_length] + for arg in inputs[3:] + ] + logits, *mems = model(*input_batch) + logit_list.append(logits) + logits = torch.cat(logit_list, dim=1) + else: + if args.pretrained_bert: + logits = model(*inputs) + else: + logits, *mems = model(*inputs) + if 'segment_id' in data: + from torch_scatter import scatter_sum + if 'loss_mask' in data: + logits = logits * data['loss_mask'] + logits = scatter_sum(logits, data['segment_id'], dim=1) + elif 'loss_mask' in data: + loss_mask = data['loss_mask'] + logits = logits * loss_mask - 10000.0 * (1.0 - loss_mask) + uid_list = batch['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + predicted = torch.argmax(logits, dim=-1).tolist() + labels = labels_.tolist() + if args.task.lower() == 'wsc': + predicted = [1 if pred == 0 else 0 for pred in predicted] + if mpu.get_model_parallel_rank() == 0: + for uid, prediction, label in zip(uid_list, predicted, labels): + store.set(uid, str((prediction, label))) + model.train() + torch.distributed.barrier() + predictions, labels, examples = [], [], [] + for uid, example in example_dict.items(): + prediction, label = eval(store.get(uid)) + predictions.append(prediction) + labels.append(label) + examples.append(example) + torch.distributed.barrier() + return predictions, labels, examples diff --git a/modelscope/models/nlp/mglm/tasks/language_model/dataset.py b/modelscope/models/nlp/mglm/tasks/language_model/dataset.py new file mode 100644 index 00000000..cfdfa714 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/dataset.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 Zhipu.AI + +import math +from bisect import bisect_right +from itertools import accumulate + +import json +import numpy as np +import torch 
+from tasks.data_utils import build_input_from_ids, num_special_tokens_to_add +from tasks.language_model.detokenizer import get_detokenizer +from utils import print_rank_0 + + +class LMDataset(torch.utils.data.Dataset): + + def __init__(self, args, documents, tokenizer, num_original_tokens, + num_tokenized_tokens): + self.args = args + self.documents = documents + self.max_seq_len = args.seq_length - 1 + self.tokenizer = tokenizer + self.overalapping_eval = args.overlapping_eval + if self.overalapping_eval is None: + self.overalapping_eval = self.max_seq_len + self.overalapping_eval = max(1, self.overalapping_eval) + self.num_original_tokens = num_original_tokens + self.num_tokenized_tokens = num_tokenized_tokens + # remove first sequence tokens + targets = [ + max(len(tokens) - self.max_seq_len, 0) for tokens in self.documents + ] + self.num_sequences = [ + max(math.ceil(target / self.overalapping_eval) + 1, 1) + for target in targets + ] + self.weights = list(accumulate(self.num_sequences)) + self.left_weights = [0] + self.weights[:-1] + self.unidirectional = args.unidirectional + self.block_lm = args.block_lm + mask_token = 'gMASK' if args.task_mask else 'MASK' + self.mask_id = self.tokenizer.get_command(mask_token).Id + + def __len__(self): + return sum(self.num_sequences) + + def __getitem__(self, idx): + document_idx = bisect_right(self.weights, idx) + idx = idx - self.left_weights[document_idx] + start_idx = idx * self.overalapping_eval + end_idx = start_idx + self.max_seq_len + tokens = self.documents[document_idx][start_idx:end_idx] + if self.block_lm: + if idx == 0 or self.unidirectional: + prompt, text = tokens[:1], tokens[1:] + else: + prompt_length = self.max_seq_len - self.overalapping_eval + prompt, text = tokens[:prompt_length], tokens[prompt_length:] + prompt = prompt + [self.mask_id] + num_special_tokens = num_special_tokens_to_add( + prompt, + None, + text, + add_cls=True, + add_sep=False, + add_piece=True, + add_eos=False) + data = build_input_from_ids( + prompt, + None, + text, + self.max_seq_len + num_special_tokens + 1, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + add_eos=False, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if idx != 0 and self.unidirectional: + loss_masks = np.array(loss_masks, dtype=np.int64) + loss_masks[:-self.overalapping_eval] = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64) + } + else: + loss_masks = [1] * len(tokens) + if len(tokens) < self.max_seq_len: + tokens = tokens + [0] * (self.max_seq_len - len(tokens)) + loss_masks = loss_masks + [0] * ( + self.max_seq_len - len(loss_masks)) + if idx != 0: + loss_masks = np.array(loss_masks, dtype=np.int64) + loss_masks[:-self.overalapping_eval] = 0 + return { + 'text': np.array(tokens, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64) + } + + +class LambadaDataset(torch.utils.data.Dataset): + + def __init__(self, args, tokenizer, strict=True): + data_path = args.valid_data[0] + print_rank_0( + '> building lambada dataset from {} ...'.format(data_path)) + self.args = args + self.max_seq_length = args.seq_length + self.tokenizer = tokenizer + self.pad_idx = tokenizer.get_command('pad').Id + self.strict = strict + self.block_lm = args.block_lm + self.unidirectional = 
args.unidirectional + mask_token = 'gMASK' if args.task_mask else 'MASK' + self.mask_id = self.tokenizer.get_command(mask_token).Id + + self.tokens = [] + self.labels = [] + with open(data_path, 'r') as f: + for line in f.readlines(): + text = json.loads(line)['text'] + tokens, labels = self.get_tokens(text) + self.tokens.append(tokens) + self.labels.append(labels) + + def get_tokens(self, text): + if not self.strict: + tokens = self.tokenizer.EncodeAsIds(text).tokenization + return tokens[:-1], [tokens[-1]] + last_token = text.split()[-1] + start_idx = text.rfind(last_token) + beginning_tokens = self.tokenizer.EncodeAsIds( + text[:start_idx].strip()).tokenization + last_token = self.tokenizer.EncodeAsIds(' ' + last_token).tokenization + return beginning_tokens, last_token + + def __len__(self): + return len(self.tokens) + + def __getitem__(self, idx): + tokens, answer = self.tokens[idx], self.labels[idx] + if self.block_lm: + if self.unidirectional: + tokens, answer_tokens = tokens[:1], tokens[1:] + answer + else: + answer_tokens = answer + tokens = tokens + [self.mask_id] + num_special_tokens = num_special_tokens_to_add( + tokens, + None, + answer_tokens, + add_cls=True, + add_sep=False, + add_piece=True) + left_shift = len(tokens) + len( + answer_tokens) + num_special_tokens - self.max_seq_length + if left_shift > 0: + tokens = tokens[left_shift:] + data = build_input_from_ids( + tokens, + None, + answer_tokens, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if self.unidirectional: + loss_masks = np.array(loss_masks, dtype=np.int64) + last_index = len(loss_masks) + while loss_masks[last_index - 1] == 0: + last_index -= 1 + loss_masks[:last_index - len(answer)] = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64) + } + else: + left_shift = len(tokens) - self.max_seq_length + if left_shift > 0: + tokens = tokens[left_shift:] + ids = tokens + answer + if len(ids) < self.max_seq_length: + ids = ids + [0] * (self.max_seq_length - len(ids)) + loss_masks = [0] * len(tokens) + [1] * len(answer) + if len(loss_masks) < self.max_seq_length: + loss_masks = loss_masks + [0] * ( + self.max_seq_length - len(loss_masks)) + return { + 'text': np.array(ids, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64) + } + + +def build_lambada_dataset(tokenizer, args): + """Build lambada dataset.""" + assert len(args.valid_data) == 1 + val_dataset = LambadaDataset(args, tokenizer, strict=True) + print_rank_0(' > found {} samples, {} label tokens.'.format( + len(val_dataset), sum(map(len, val_dataset.labels)))) + return val_dataset + + +def build_lm_dataset(tokenizer, args): + documents = [] + num_tokens, num_original_tokens = 0, 0 + with open(args.valid_data[0], encoding='utf-8') as file: + for line in file: + tokens = tokenizer.EncodeAsIds(line.strip()).tokenization + num_tokens += len(tokens) + num_original_tokens += len(line.strip().split(' ')) + documents.append(tokens) + val_dataset = LMDataset(args, documents, tokenizer, num_original_tokens, + num_tokens) + print_rank_0( + ' > number of document: {}, number of original tokens {}, number of detokenized tokens: {}' + .format(len(documents), num_original_tokens, num_tokens)) 
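+    # num_original_tokens counts whitespace-separated words while num_tokens
+    # counts tokenizer ids; LMDataset keeps both so that evaluation can report a
+    # token-ratio-adjusted perplexity.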
+ return val_dataset + + +def build_wikitext103_dataset(tokenizer, args): + """""" + + assert len(args.valid_data) == 1 + with open(args.valid_data[0], 'rb') as reader: + entire_data = reader.read().decode('utf-8') + num_original_tokens = len(entire_data.strip().split(' ')) + entire_data = get_detokenizer('wikitext')(entire_data) + print_rank_0(entire_data[:1024]) + tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization + num_tokenized_tokens = len(tokenized_data) + + val_dataset = LMDataset(args, [tokenized_data], tokenizer, + num_original_tokens, num_tokenized_tokens) + print_rank_0(' > number of original tokens: {}, number of detokenized ' + 'tokens: {}'.format(num_original_tokens, + num_tokenized_tokens)) + return val_dataset diff --git a/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py b/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py new file mode 100755 index 00000000..dc1524de --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 Zhipu.AI + +import re + + +def ptb_detokenizer(string): + string = string.replace(" '", "'") + string = string.replace(' \n', '\n') + string = string.replace('\n ', '\n') + string = string.replace(" n't", "n't") + string = string.replace(' N ', '1 ') + string = string.replace('$ 1', '$1') + string = string.replace('# 1', '#1') + return string + + +def wikitext_detokenizer(string): + # contractions + string = string.replace("s '", "s'") + string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) + # number separators + string = string.replace(' @-@ ', '-') + string = string.replace(' @,@ ', ',') + string = string.replace(' @.@ ', '.') + # punctuation + string = string.replace(' : ', ': ') + string = string.replace(' ; ', '; ') + string = string.replace(' . ', '. ') + string = string.replace(' ! ', '! ') + string = string.replace(' ? ', '? ') + string = string.replace(' , ', ', ') + # double brackets + string = re.sub(r'\(\s*([^\)]*?)\s*\)', r'(\1)', string) + string = re.sub(r'\[\s*([^\]]*?)\s*\]', r'[\1]', string) + string = re.sub(r'{\s*([^}]*?)\s*}', r'{\1}', string) + string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) + string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) + # miscellaneous + string = string.replace('= = = =', '====') + string = string.replace('= = =', '===') + string = string.replace('= =', '==') + string = string.replace(' ' + chr(176) + ' ', chr(176)) + string = string.replace(' \n', '\n') + string = string.replace('\n ', '\n') + string = string.replace(' N ', ' 1 ') + string = string.replace(" 's", "'s") + + return string + + +def lambada_detokenizer(string): + return string + + +def get_detokenizer(dataset): + return DETOKENIZERS[dataset] + + +DETOKENIZERS = { + 'ptb': ptb_detokenizer, + 'wikitext': wikitext_detokenizer, + 'lambada': lambada_detokenizer, +} diff --git a/modelscope/models/nlp/mglm/tasks/language_model/finetune.py b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py new file mode 100644 index 00000000..b6089e6f --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py @@ -0,0 +1,254 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPT2 zero-shot evaluation.""" + +import functools +import math + +import mpu +import torch +from finetune_glm import finetune +from pretrain_glm import get_batch +from tasks.data_utils import build_data_loader +from tasks.language_model.dataset import (build_lambada_dataset, + build_lm_dataset, + build_wikitext103_dataset) +from utils import print_rank_0 + +global_tokenizer = None + + +def lm_forward_step(data, model, args, timers, mems, eval_metric=None): + """Forward step.""" + + # Get the batch. + if timers is not None: + timers('batch generator').start() + if 'mask' in data: + data['attention_mask'] = data.pop('mask') + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data, args) + if timers is not None: + timers('batch generator').stop() + + def print_masked_text(batch_id): + block_position_ids = position_ids[:, 1] + position_ids_ = position_ids[:, 0] + output_tokens = [] + sep = attention_mask[batch_id].item() + for i, token in enumerate(tokens[batch_id, :sep].tolist()): + if global_tokenizer is not None: + token = global_tokenizer.IdToToken(token) + if token.startswith('[MASK'): + token = f'[{position_ids_[batch_id, i].item()}, {token}]' + if token.startswith('##') and len( + output_tokens) > 0 and not output_tokens[-1].endswith( + ']'): + output_tokens[-1] += token[2:] + else: + output_tokens.append(token) + else: + output_tokens.append(str(token)) + print(' '.join(output_tokens)) + last_index = None + for i in range(sep, tokens.size(1)): + if global_tokenizer.IdToToken( + tokens[batch_id, i].item()).startswith('<|startofpiece'): + if last_index is not None: + print( + global_tokenizer.DecodeIds( + tokens[batch_id, last_index:i].tolist()), '|', + global_tokenizer.DecodeIds( + labels[batch_id, last_index:i].tolist())), + print(position_ids_[batch_id, last_index:i].tolist(), + block_position_ids[batch_id, last_index:i].tolist()) + last_index = i + if last_index is not None: + print( + global_tokenizer.DecodeIds(tokens[batch_id, + last_index:].tolist()), '|', + global_tokenizer.DecodeIds(labels[batch_id, + last_index:].tolist())) + print(position_ids_[batch_id, last_index:].tolist(), + block_position_ids[batch_id, last_index:].tolist()) + + # Forward model. 
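+    # When continuous prompts are used, the prompt token positions from the batch
+    # are forwarded to the model via the prompt_pos keyword argument.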
+ if args.continuous_prompt: + prompt_pos = data['prompt_pos'].long().cuda() + logits, *mems = model( + tokens, position_ids, attention_mask, *mems, prompt_pos=prompt_pos) + else: + logits, *mems = model(tokens, position_ids, attention_mask, *mems) + + if eval_metric is None or eval_metric == 'loss': + losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), + labels) + loss_mask = loss_mask.view(-1) + # The loss is not normalized for fair comparison + loss = torch.sum(losses.view(-1) * loss_mask) + if eval_metric is None: + loss = loss / loss_mask.sum() + return loss, mems, 'bert' + elif eval_metric == 'accuracy' or eval_metric == 'classify': + logits = mpu.gather_from_model_parallel_region(logits) + outputs = torch.argmax(logits, -1) + correct = (outputs == labels).float() + correct[(1 - loss_mask).bool()] = 1 + correct = correct.prod(-1) + if eval_metric == 'accuracy': + correct = correct.sum() + return correct, mems, 'bert' + else: + raise NotImplementedError( + 'Metric {} not implemented'.format(eval_metric)) + + +def classify_evaluate(model, dataloader, example_dict, args): + """Evaluation.""" + # Turn on evaluation mode which disables dropout. + model.eval() + predictions, labels, examples = [], [], [] + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(dataloader): + # Forward evaluation. + output, _, _ = lm_forward_step( + batch, model, args, None, [], eval_metric='classify') + uid_list = batch['uid'] + example_batch = [example_dict[uid] for uid in uid_list] + predictions.extend(output.long().tolist()) + label = batch['label'].tolist() + labels.extend(label) + examples.extend(example_batch) + return predictions, labels, examples + + +def evaluate(model, dataloader, eval_metric, args): + """Evaluation.""" + # Turn on evaluation mode which disables dropout. + model.eval() + total_output, total_count = 0.0, 0 + total_tokens = 0 + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(dataloader): + if (iteration + 1) % args.log_interval == 0: + print_rank_0('> working on iteration: {}'.format(iteration)) + # Forward evaluation. + output, _, _ = lm_forward_step( + batch, model, args, None, [], eval_metric=eval_metric) + count = batch['text'].size(0) + count = torch.cuda.LongTensor([count]) + # Reduce across processes. + torch.distributed.all_reduce( + output, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce( + count, group=mpu.get_data_parallel_group()) + + total_output += output.item() + total_count += count.item() + total_tokens += batch['loss_mask'].sum().item() + totals = torch.cuda.FloatTensor([total_output, total_tokens]) + torch.distributed.all_reduce(totals, group=mpu.get_data_parallel_group()) + total_output, total_tokens = totals.tolist() + print(total_tokens) + return {eval_metric: total_output}, total_count + + +def evaluate_and_print_results(data_loader, model, eval_metric, args): + """Evaluate and print results on screen.""" + + # Evaluate and get results. 
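+    # evaluate() returns the metric summed over the dataset together with the
+    # sample count; the branches below normalize it into per-token loss and
+    # perplexity or into accuracy, and assemble the string printed on rank 0.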
+ output, _ = evaluate(model, data_loader, eval_metric, args) + + string = '' + if eval_metric == 'loss': + output = output['loss'] + num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens + num_original_tokens = data_loader.dataset.num_original_tokens + val_loss = output / (num_tokenized_tokens - 1) + ppl = math.exp(min(20, val_loss)) + token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1) + adjusted_ppl = math.exp(min(20, val_loss * token_ratio)) + string += 'avg loss: {:.4E} | '.format(val_loss) + string += 'ppl: {:.4E} | '.format(ppl) + string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) + string += 'token ratio: {} |'.format(token_ratio) + score_dict = { + 'avg loss': val_loss, + 'ppl': ppl, + 'adjusted ppl': adjusted_ppl + } + + elif eval_metric == 'accuracy': + output = output['accuracy'] + num_examples = len(data_loader.dataset) + acc = output / num_examples * 100 + string += 'number correct: {} | '.format(output) + string += 'total examples: {} | '.format(num_examples) + string += 'avg accuracy: {:.2f}'.format(acc) + score_dict = {'accuracy': acc} + else: + raise NotImplementedError('evaluation method for {} metric is not ' + 'implemented yet.'.format(eval_metric)) + + length = len(string) + 1 + print_rank_0('-' * length) + print_rank_0(string) + print_rank_0('-' * length) + return score_dict + + +def metrics_func_provider(args, tokenizer, is_test): + """Privde metrics callback function.""" + + if args.task.lower() == 'lambda': + eval_metric = 'accuracy' + dataset = build_lambada_dataset(tokenizer, args) + elif args.task == 'wikitext': + eval_metric = 'loss' + dataset = build_wikitext103_dataset(tokenizer, args) + elif args.task == 'language_model': + eval_metric = 'loss' + dataset = build_lm_dataset(tokenizer, args) + else: + raise NotImplementedError('{} task is not implemented.'.format( + args.task)) + # Data stuff + dataloader = build_data_loader( + dataset, + args.eval_batch_size, + args.num_workers, + drop_last=False, + shuffle=False) + + def metrics_func(model, + epoch, + output_predictions=False, + summary_writer=None): + return evaluate_and_print_results( + dataloader, model, eval_metric=eval_metric, args=args) + + global global_tokenizer + global_tokenizer = tokenizer + return metrics_func + + +def main(args): + """Main program.""" + finetune( + args, + None, {}, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=lm_forward_step) diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py b/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py new file mode 100644 index 00000000..6a4e275f --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py @@ -0,0 +1,667 @@ +# Copyright (c) 2022 Zhipu.AI + +import os +import random + +import json +import numpy as np +import torch +import torch.utils.data +from data_utils.corpora import punctuation_standardization +from tasks.data_utils import InputExample +from tqdm import tqdm +from utils import print_rank_0 + + +def gigaword_detokenize(string, is_target=False): + _tok_dict = { + '(': '-lrb-', + ')': '-rrb-', + '[': '-lsb-', + ']': '-rsb-', + '{': '-lcb-', + '}': '-rcb-', + '&': '&', + '<': '<', + '>': '>' + } + string = string.replace('UNK', '[UNK]') + string = string.replace('', '[UNK]') + for key, value in _tok_dict.items(): + string = string.replace(value, key) + # string = string.replace("''", "\"") + # string = string.replace("``", "\"") + # string = string.replace("`", "'") + # string = string.replace(" n't", "n't") + # string = string.replace(" 's", "'s") + 
# string = string.replace(" 'd", "'d") + # string = string.replace(" 'll", "'ll") + return string + + +def cnndm_detokenize(string, is_target=False): + _tok_dict = { + '(': '-LRB-', + ')': '-RRB-', + '[': '-LSB-', + ']': '-RSB-', + '{': '-LCB-', + '}': '-RCB-' + } + if not is_target: + string = string.replace('', '') + else: + string = string.replace('', '[SEP]') + for key, value in _tok_dict.items(): + string = string.replace(value, key) + string = string.replace("''", "\"") + string = string.replace('``', "\"") + string = string.replace('`', "'") + string = string.replace(" n't", "n't") + string = string.replace(" 's", "'s") + string = string.replace(" 'd", "'d") + string = string.replace(" 'll", "'ll") + return string + + +def blanklm_detokenize(string, is_target=False): + string = string.replace('_UNK', '[UNK]') + string = string.replace('', '[MASK]') + return string + + +class SummmaryProcessor: + + def __init__(self, task, data_dir, tokenizer): + self.task = task + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'val' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0( + f'Creating {self.task}-{split} dataset from {self.data_dir}') + if self.task == 'gigaword': + detokenizer = gigaword_detokenize + elif self.task == 'cnn_dm': + detokenizer = cnndm_detokenize + else: + detokenizer = None + source_texts, target_texts = [], [] + with open( + os.path.join(self.data_dir, f'{filename}.source'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = punctuation_standardization(line) + line = detokenizer(line) if detokenizer else line + source_texts.append(line) + with open( + os.path.join(self.data_dir, f'{filename}.target'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = punctuation_standardization(line) + line = detokenizer( + line, is_target=True) if detokenizer else line + target_texts.append(line) + assert len(source_texts) == len(target_texts) + example_list = [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = { + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds(target_text).tokenization) + } + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + if idx < 10: + print_rank_0( + (source_text.encode('utf-8'), target_text.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + return example_list + + +class SQuADProcessor: + + def __init__(self, data_dir, tokenizer): + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + filename = 'train.json' + elif split == 'dev': + filename = 'dev.json' + elif split == 'test': + filename = 'test.json' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating SQuAD-{split} dataset from {self.data_dir}') + example_list = [] + idx = 0 + with open( + os.path.join(self.data_dir, filename), + encoding='utf-8') as file: + dataset = json.load(file) + for paragraphs in dataset: + for paragraph in paragraphs['paragraphs']: + context = paragraph['context'] + for qa in paragraph['qas']: + question = qa['question'] + answers = {answer['text'] for answer in qa['answers']} + answer_starts = { + answer['text']: answer['answer_start'] + 
for answer in qa['answers'] + } + for answer in answers: + guid = '%s-%s' % (split, idx) + meta = { + 'answer_start': + answer_starts[answer], + 'answer': + answer, + 'question': + question, + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds( + question).tokenization) + } + example = InputExample( + guid=guid, text_a=context, meta=meta) + if idx < 10: + print_rank_0((context.encode('utf-8'), + answer.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + idx += 1 + print_rank_0(f'Creating {len(example_list)} examples for {split}') + return example_list + + +class XSumProcessor: + + def __init__(self, data_dir, tokenizer): + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + key = 'train' + elif split == 'dev': + key = 'validation' + elif split == 'test': + key = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating XSUM-{split} dataset from {self.data_dir}') + with open( + os.path.join( + self.data_dir, + 'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json')) as file: + id_list = json.load(file) + id_list = id_list[key] + source_texts, target_texts = [], [] + for i, idx in enumerate(id_list): + with open(os.path.join(self.data_dir, f'{idx}.summary')) as file: + key, sentences = None, [] + source_text, target_text = None, None + for line in file: + line = line.strip() + if line.startswith('[SN]'): + if key is not None: + if key == 'RESTBODY': + source_text = ' '.join(sentences) + elif key == 'FIRST-SENTENCE': + target_text = ' '.join(sentences) + key = line[4:-4] + sentences = [] + elif line: + sentences.append(line) + if key is not None: + if key == 'RESTBODY': + source_text = ' '.join(sentences) + elif key == 'FIRST-SENTENCE': + target_text = ' '.join(sentences) + source_texts.append(source_text) + target_texts.append(target_text) + if (i + 1) % 1000 == 0: + print_rank_0(f'Complete {i + 1} examples') + assert len(source_texts) == len(target_texts) + example_list = [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = { + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds(target_text).tokenization) + } + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + if idx < 10: + print_rank_0( + (source_text.encode('utf-8'), target_text.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + return example_list + + +class Seq2SeqDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + self.task, self.data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + self.tokenizer = tokenizer + self.dataset_name = split + if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original']: + self.processor = SummmaryProcessor(self.task, self.data_dir, + tokenizer) + elif self.task in ['xsum']: + self.processor = XSumProcessor(self.data_dir, tokenizer) + elif self.task in ['squad_generation']: + self.processor = SQuADProcessor(self.data_dir, tokenizer) + else: + raise NotImplementedError + example_list = self.processor.create_examples(split) + self.example_list = example_list + self.examples = {example.guid: example for example in example_list} + + print_rank_0(f'Return {len(self.examples)} {split} examples') + + def __len__(self): + 
return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + cls_id = self.tokenizer.get_command('ENC').Id + mask_token = 'sMASK' if self.args.task_mask else 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + pad_id = self.tokenizer.get_command('pad').Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original', 'xsum']: + source_text, target_text = example.text_a, example.text_b + source_tokens = self.tokenizer.EncodeAsIds( + ' ' + source_text).tokenization + prompt = [cls_id, mask_id + ] + self.tokenizer.EncodeAsIds(' Content:').tokenization + if len(source_tokens) > self.max_src_length - len(prompt): + source_tokens = source_tokens[:self.max_src_length + - len(prompt)] + source_tokens = prompt + source_tokens + elif self.task == 'squad_generation': + source_text = example.text_a + target_text, answer = example.meta['question'], example.meta[ + 'answer'] + source_tokens = self.tokenizer.EncodeAsIds( + source_text.rstrip() + ' Question:').tokenization + answer_tokens = self.tokenizer.EncodeAsIds(' Answer: ' + + answer).tokenization + if len(source_tokens + ) > self.max_src_length - len(answer_tokens) - 2: + max_src_length = self.max_src_length - len(answer_tokens) - 2 + answer_pattern = self.tokenizer.EncodeAsIds( + ' ' + answer).tokenization + + def sub_finder(mylist, pattern): + matches = [] + for i in range(len(mylist)): + if mylist[i] == pattern[0] and mylist[ + i:i + len(pattern)] == pattern: + matches.append(i) + return matches + + answer_indices = sub_finder(source_tokens, answer_pattern) + if len(answer_indices) == 0: + print(f'Answer {answer} not exists in the source text') + source_tokens = source_tokens[:max_src_length] + else: + start_index = max(answer_indices[0] - max_src_length // 2, + 0) + source_tokens = source_tokens[start_index:start_index + + max_src_length] + source_tokens = [cls_id] + source_tokens + [mask_id + ] + answer_tokens + else: + raise NotImplementedError + if len(source_tokens) < self.max_src_length: + source_tokens = source_tokens + [pad_id] * ( + self.max_src_length - len(source_tokens)) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + mask_pos = source_tokens.index(mask_id) + if self.split == 'train': + target_tokens = self.tokenizer.EncodeAsIds( + ' ' + target_text).tokenization + target_tokens = target_tokens + [eop_id] + if len(target_tokens) > self.max_tgt_length: + target_tokens = target_tokens[:self.max_tgt_length] + loss_mask = [1] * len(target_tokens) + if len(target_tokens) < self.max_tgt_length: + loss_mask += [0] * (self.max_tgt_length - len(target_tokens)) + target_tokens += [pad_id] * ( + self.max_tgt_length - len(target_tokens)) + tokens = source_tokens + [sop_id] + target_tokens[:-1] + loss_mask = [0] * len(source_tokens) + loss_mask + target_ids = [0] * len(source_tokens) + target_tokens + position_ids += [mask_pos] * len(target_tokens) + if self.args.no_block_position: + block_position_ids += [1] * len(target_tokens) + else: + block_position_ids += list(range(1, len(target_tokens) + 1)) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': 
example.guid + } + else: + tokens = source_tokens + [sop_id] + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + +class ExtractionDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + task, data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + self.tokenizer = tokenizer + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'valid' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating {task}-{split} dataset from {data_dir}') + self.dataset_name = split + source_texts, target_texts = [], [] + with open( + os.path.join(data_dir, f'{filename}.source'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + source_texts.append(line) + with open( + os.path.join(data_dir, f'{filename}.target'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + target_texts.append(line) + self.examples, self.example_list = {}, [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = {'ref': target_text} + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + self.examples[guid] = example + self.example_list.append(example) + print_rank_0(f'Return {len(self.examples)} {split} examples') + + def __len__(self): + return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + source_text, target_text = example.text_a, example.text_b + mask_token = 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + pad_id = self.tokenizer.get_command('pad').Id + + def pad_to(text, max_len, pad_id): + if len(text) > max_len: + text = text[:max_len] + else: + text = text + [pad_id] * (max_len - len(text)) + return text + + source_tokens = self.tokenizer.EncodeAsIds(source_text).tokenization + masked_tgt = target_text.split('|') + source_tokens = pad_to(source_tokens, self.max_src_length, pad_id) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + if self.split == 'train': + mask_positions = [ + i for i, x in enumerate(source_tokens) if x == mask_id + ] + assert len(mask_positions) <= len(masked_tgt) + tokens = source_tokens + target_ids = [0] * len(source_tokens) + loss_mask = [0] * len(source_tokens) + for i, mask_pos in enumerate(mask_positions): + tgt_text = masked_tgt[i] + tgt_tokens = self.tokenizer.EncodeAsIds( + ' ' + tgt_text).tokenization + tokens += [sop_id] + tgt_tokens + target_ids += tgt_tokens + [eop_id] + loss_mask += [1] * (len(tgt_tokens) + 1) + position_ids += [mask_pos] * (len(tgt_tokens) + 1) + block_position_ids += [ + i + 1 for i in range(len(tgt_tokens) + 1) + ] + tokens = pad_to(tokens, self.max_src_length + self.max_tgt_length, + pad_id) + target_ids = pad_to(target_ids, + self.max_src_length + self.max_tgt_length, + pad_id) + loss_mask = 
pad_to(loss_mask, + self.max_src_length + self.max_tgt_length, 0) + position_ids = pad_to(position_ids, + self.max_src_length + self.max_tgt_length, 0) + block_position_ids = pad_to( + block_position_ids, self.max_src_length + self.max_tgt_length, + 0) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + else: + tokens = source_tokens + [sop_id] + mask_pos = source_tokens.index(mask_id) + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + +class BlankLMDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + task, data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + assert args.tokenizer_type == 'BertWordPieceTokenizer' + self.tokenizer = tokenizer + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'valid' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating {task}-{split} dataset from {data_dir}') + self.dataset_name = split + detokenizer = blanklm_detokenize + source_texts, target_texts = [], [] + with open( + os.path.join(data_dir, f'{filename}.txt'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = detokenizer(line) if detokenizer else line + target_texts.append(line) + if split == 'test': + with open( + os.path.join( + data_dir, + f'blank/test.maskratio{args.blank_maskratio:.1f}.blank' + ), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = detokenizer(line) if detokenizer else line + source_texts.append(line) + else: + source_texts = target_texts + self.examples, self.example_list = {}, [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + # if idx > 10000: + # break + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = {'ref': target_text} + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + self.examples[guid] = example + self.example_list.append(example) + print_rank_0(f'Return {len(self.examples)} {split} examples') + self.random = random.Random(args.seed) + + def __len__(self): + return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + source_text, target_text = example.text_a, example.text_b # noqa + mask_token = 'gMASK' if self.args.task_mask else 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + pad_id = self.tokenizer.get_command('pad').Id + if self.split in ['train', 'dev']: + masked_src, masked_tgt = self.mask_text(source_text) + source_text = masked_src + + def pad_to(text, max_len, pad_id): + if len(text) > max_len: + text = text[:max_len] + else: + text = text + [pad_id] * (max_len - len(text)) + return text + 
+ source_tokens = self.tokenizer.EncodeAsIds(' ' + + source_text).tokenization + source_tokens = pad_to(source_tokens, self.max_src_length, pad_id) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + if self.split in ['train', 'dev']: + mask_positions = [ + i for i, x in enumerate(source_tokens) if x == mask_id + ] + assert len(mask_positions) <= len(masked_tgt) + tokens = source_tokens + target_ids = [0] * len(source_tokens) + loss_mask = [0] * len(source_tokens) + for i, mask_pos in enumerate(mask_positions): + tgt_text = masked_tgt[i] + tgt_tokens = self.tokenizer.EncodeAsIds( + ' ' + tgt_text).tokenization + tokens += [sop_id] + tgt_tokens + target_ids += tgt_tokens + [eop_id] + loss_mask += [1] * (len(tgt_tokens) + 1) + position_ids += [mask_pos] * (len(tgt_tokens) + 1) + block_position_ids += [ + i + 1 for i in range(len(tgt_tokens) + 1) + ] + max_length = self.max_src_length + int( + self.max_src_length * self.args.blank_maskratio) + tokens = pad_to(tokens, max_length, pad_id) + target_ids = pad_to(target_ids, max_length, pad_id) + loss_mask = pad_to(loss_mask, max_length, 0) + position_ids = pad_to(position_ids, max_length, 0) + block_position_ids = pad_to(block_position_ids, max_length, 0) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + else: + tokens = source_tokens + [sop_id] + mask_pos = source_tokens.index(mask_id) + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + def mask_text(self, text): + tokens = text.split() + mask_ratio = self.args.blank_maskratio + n = len(tokens) + indices = sorted(self.random.sample(range(n), int(n * mask_ratio))) + masked_src, masked_tgt = '', [] + for i, idx in enumerate(indices): + if i == 0 or idx != indices[i - 1] + 1: + masked_tgt.append('') + masked_tgt[-1] += ' ' + tokens[idx] + tokens[idx] = '[MASK]' + for i, token in enumerate(tokens): + if i != 0 and token == '[MASK]' and tokens[i - 1] == '[MASK]': + continue + masked_src += ' ' + token + return masked_src, masked_tgt diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py new file mode 100644 index 00000000..5fd28b89 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py @@ -0,0 +1,538 @@ +# Copyright (c) 2022 Zhipu.AI + +import datetime +import random +import string + +import mpu +import torch +import torch.nn.functional as F +from generation_utils import (BeamSearchScorer, LogitsProcessorList, + MinLengthLogitsProcessor, + NoRepeatNGramLogitsProcessor) +from rouge_score import rouge_scorer +from utils import print_rank_0 + + +def _is_digit(w): + for ch in w: + if not (ch.isdigit() or ch == ','): + return False + return True + + +gigaword_tok_dict = { + '(': '-lrb-', + ')': '-rrb-', + '[': '-lsb-', + ']': '-rsb-', + '{': '-lcb-', + '}': '-rcb-', + '[UNK]': 'UNK', + '&': '&', + '<': '<', + '>': '>' +} + +cnndm_tok_dict = { + '(': '-LRB-', + 
')': '-RRB-', + '[': '-LSB-', + ']': '-RSB-', + '{': '-LCB-', + '}': '-RCB-' +} + + +def fix_tokenization(text, dataset): + if dataset == 'cnn_dm_org': + return text + if dataset == 'gigaword': + text = text.replace('[UNK]', 'UNK') + return text + input_tokens = text.split() + output_tokens = [] + has_left_quote = False + has_left_single_quote = False + + i = 0 + prev_dash = False + while i < len(input_tokens): + tok = input_tokens[i] + flag_prev_dash = False + if tok == "\"": + if has_left_quote: + output_tokens.append("''") + else: + output_tokens.append('``') + has_left_quote = not has_left_quote + i += 1 + elif tok == "'" and len( + output_tokens) > 0 and output_tokens[-1].endswith( + 'n') and i < len(input_tokens) - 1 and input_tokens[ + i + 1] == 't': # noqa + output_tokens[-1] = output_tokens[-1][:-1] + output_tokens.append("n't") + i += 2 + elif tok == "'" and i < len(input_tokens) - 1 and input_tokens[ + i + 1] in ('s', 'd', 'll'): + output_tokens.append("'" + input_tokens[i + 1]) + i += 2 + elif tok == "'": + if has_left_single_quote: + output_tokens.append("'") + else: + output_tokens.append('`') + has_left_single_quote = not has_left_single_quote + i += 1 + elif tok == '.' and i < len(input_tokens) - 2 and input_tokens[ + i + 1] == '.' and input_tokens[i + 2] == '.': + output_tokens.append('...') + i += 3 + elif tok == ',' and len(output_tokens) > 0 and _is_digit( + output_tokens[-1]) and i < len(input_tokens) - 1 and _is_digit( + input_tokens[i + 1]): + # $ 3 , 000 -> $ 3,000 + output_tokens[-1] += ',' + input_tokens[i + 1] + i += 2 + elif tok == '.' and len(output_tokens) > 0 and output_tokens[-1].isdigit() and i < len(input_tokens) - 1 and \ + input_tokens[i + 1].isdigit(): + # 3 . 03 -> $ 3.03 + output_tokens[-1] += '.' + input_tokens[i + 1] + i += 2 + elif tok == '.' and len(output_tokens) > 0 and len( + output_tokens[-1]) == 1 and output_tokens[-1].isalpha( # noqa + ) and i < len(input_tokens) - 2 and len( # noqa + input_tokens[i + 1]) == 1 and input_tokens[ + i + 1].isalpha( # noqa + ) and input_tokens[i + 2] == '.': # noqa + # U . N . -> U.N. 
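+                # Greedily absorb the following "<letter> ." pairs so the whole
+                # abbreviation collapses into a single token.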
+ k = i + 3 + while k + 2 < len(input_tokens): + if len(input_tokens[k + 1]) == 1 and input_tokens[ + k + 1].isalpha() and input_tokens[k + 2] == '.': + k += 2 + else: + break + output_tokens[-1] += ''.join(input_tokens[i:k]) + i = k + elif tok == '-': + if i < len(input_tokens) - 1 and input_tokens[i + 1] == '-': + output_tokens.append('--') + i += 2 + elif i == len(input_tokens) - 1 or i == 0: + output_tokens.append('-') + i += 1 + elif output_tokens[-1] not in string.punctuation and input_tokens[ + i + 1][0] not in string.punctuation: + output_tokens[-1] += '-' + i += 1 + flag_prev_dash = True + else: + output_tokens.append('-') + i += 1 + elif prev_dash and len( + output_tokens) > 0 and tok[0] not in string.punctuation: + output_tokens[-1] += tok + i += 1 + else: + output_tokens.append(tok) + i += 1 + prev_dash = flag_prev_dash + return ' '.join(output_tokens) + + +def count_tokens(tokens): + counter = {} + for t in tokens: + if t in counter.keys(): + counter[t] += 1 + else: + counter[t] = 1 + return counter + + +def get_f1(text_a, text_b): + tokens_a = text_a.lower().split() + tokens_b = text_b.lower().split() + if len(tokens_a) == 0 or len(tokens_b) == 0: + return 1 if len(tokens_a) == len(tokens_b) else 0 + set_a = count_tokens(tokens_a) + set_b = count_tokens(tokens_b) + match = 0 + for token in set_a.keys(): + if token in set_b.keys(): + match += min(set_a[token], set_b[token]) + p = match / len(tokens_a) + r = match / len(tokens_b) + return 2.0 * p * r / (p + r + 1e-5) + + +def remove_duplicate(l_list, duplicate_rate): + tk_list = [l.lower().split() for l in l_list] # noqa + r_list = [] + history_set = set() + for i, w_list in enumerate(tk_list): + w_set = set(w_list) + if len(w_set & history_set) / len(w_set) <= duplicate_rate: + r_list.append(l_list[i]) + history_set |= w_set + return r_list + + +def rouge_metric(predictions, + labels, + examples, + metric='rouge-1', + duplicate_rate=0.7, + dataset='cnn_dm'): + metric_dict = { + 'rouge-1': 'rouge1', + 'rouge-2': 'rouge2', + 'rouge-l': 'rougeLsum' + } + refs = [example.meta['ref'] for example in examples] + ref_list = [] + for ref in refs: + ref = ref.strip().split('[SEP]') + ref = [fix_tokenization(sentence, dataset=dataset) for sentence in ref] + ref = '\n'.join(ref) + ref_list.append(ref) + pred_list = [] + for prediction in predictions: + buf = [] + for sentence in prediction.strip().split('[SEP]'): + sentence = fix_tokenization(sentence, dataset=dataset) + if any(get_f1(sentence, s) > 1.0 for s in buf): + continue + s_len = len(sentence.split()) + if s_len <= 4: + continue + buf.append(sentence) + if duplicate_rate and duplicate_rate < 1: + buf = remove_duplicate(buf, duplicate_rate) + line = '\n'.join(buf) + pred_list.append(line) + if torch.distributed.get_rank() == 0: + import json + with open('./results.json', 'w') as output: + for ref, pred in zip(ref_list, pred_list): + output.write(json.dumps({'ref': ref, 'pred': pred}) + '\n') + scorer = rouge_scorer.RougeScorer([metric_dict[metric]], use_stemmer=True) + scores = [ + scorer.score(pred, ref) for pred, ref in zip(pred_list, ref_list) + ] + scores = [score[metric_dict[metric]].fmeasure for score in scores] + scores = sum(scores) / len(scores) + return scores + + +def process_batch(batch, args): + """Process batch and produce inputs for the model.""" + tokens = batch['text'].long().cuda() + attention_mask = batch['attention_mask'].long().cuda() + position_ids = batch['position_id'].long().cuda() + return tokens, attention_mask, position_ids + + +class DecoderEvaluater: 
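+    """Beam-search decoder used for seq2seq evaluation.
+
+    Predictions are generated with the logits processors configured below and
+    gathered across data-parallel ranks through a temporary TCPStore.
+    """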
+ + def __init__(self, args, tokenizer): + self.tokenizer = tokenizer + self.start_token = tokenizer.get_command('sop').Id + self.end_token = tokenizer.get_command('eop').Id + self.mask_token = tokenizer.get_command( + 'sMASK').Id if args.task_mask else tokenizer.get_command('MASK').Id + self.pad_token = tokenizer.get_command('pad').Id + self.processors = LogitsProcessorList() + if args.min_tgt_length > 0: + processor = MinLengthLogitsProcessor(args.min_tgt_length, + self.end_token) + self.processors.append(processor) + if args.no_repeat_ngram_size > 0: + processor = NoRepeatNGramLogitsProcessor(args.no_repeat_ngram_size) + self.processors.append(processor) + + def evaluate(self, model, dataloader, example_dict, args): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + model.eval() + store = torch.distributed.TCPStore(args.master_ip, + 18931 + random.randint(0, 10000), + mpu.get_data_parallel_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + print_rank_0('Distributed store created') + with torch.no_grad(): + # For all the batches in the dataset. + for idx, data in enumerate(dataloader): + tokens, attention_mask, position_ids = process_batch( + data, args) + batch_size = tokens.size(0) + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=args.out_seq_length, + num_beams=args.num_beams, + device=tokens.device, + length_penalty=args.length_penalty, + do_early_stopping=False, + ) + beam_scores = torch.zeros((batch_size, args.num_beams), + dtype=torch.float, + device=tokens.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * args.num_beams, )) + # Run the model forward. + counter = 0 + while counter < args.tgt_seq_length: + if counter == 0: + next_token_logits, *mems = model( + tokens, + position_ids, + attention_mask, + return_memory=True) + seq_length = next_token_logits.size(1) + next_token_logits = next_token_logits[:, -1] + next_token_logits = next_token_logits.unsqueeze( + 1).repeat(1, args.num_beams, + 1).view(batch_size * args.num_beams, -1) + mems = [ + mem.unsqueeze(1).repeat( + 1, args.num_beams, 1, + 1).view(batch_size * args.num_beams, + seq_length, -1) for mem in mems + ] + position_ids = tokens.new_ones(batch_size, + args.num_beams, 2, 1) + for i, text in enumerate(tokens.tolist()): + mask_pos = text.index(self.mask_token) + position_ids[i, :, 0] = mask_pos + position_ids = position_ids.reshape( + batch_size * args.num_beams, 2, 1) + tokens = tokens.new_zeros(batch_size * args.num_beams, + 0) + attention_mask = tokens.new_zeros( + [batch_size * args.num_beams]) + else: + if not args.no_block_position: + position_ids[:, 1] = counter + 1 + last_token = tokens[:, -1:] + next_token_logits, *mems = model( + last_token, + position_ids, + attention_mask, + *mems, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + next_token_scores = F.log_softmax( + next_token_logits, dim=-1) + next_token_scores = self.processors( + tokens, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as( + next_token_scores) + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view( + batch_size, args.num_beams * vocab_size) + + probs = F.softmax(next_token_scores, dim=-1) + if args.select_topk: + _, next_tokens = torch.topk( + probs, k=2 * args.num_beams, dim=-1, largest=True) + else: + next_tokens = torch.multinomial( + probs, num_samples=2 * args.num_beams) + next_token_scores = 
torch.gather(next_token_scores, -1, + next_tokens) + next_token_scores, _indices = torch.sort( + next_token_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, _indices) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + # stateless + beam_outputs = beam_scorer.process( + tokens, + next_token_scores, + next_tokens, + next_indices, + eos_token_id=self.end_token, + pad_token_id=self.pad_token) + beam_scores = beam_outputs['next_beam_scores'] + beam_next_tokens = beam_outputs['next_beam_tokens'] + beam_idx = beam_outputs['next_beam_indices'] + beam_next_tokens = beam_next_tokens.unsqueeze(-1) + tokens = torch.cat([tokens[beam_idx, :], beam_next_tokens], + dim=-1) + mems = [mem[beam_idx] for mem in mems] if mems else [] + if beam_scorer.is_done: + break + counter += 1 + tokens, _ = beam_scorer.finalize( + tokens, + beam_scores, + next_tokens, + next_indices, + eos_token_id=self.end_token, + pad_token_id=self.pad_token) + predictions = [] + for text in tokens.tolist(): + text = [ + token for token in text + if token not in [self.end_token, self.pad_token] + ] + text = self.tokenizer.DecodeIds(text) + predictions.append(text) + uid_list = data['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + for uid, prediction in zip(uid_list, predictions): + store.set(uid, prediction) + if (idx + 1) % args.log_interval == 0: + print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}') + model.train() + torch.distributed.barrier() + print_rank_0('Evaluation completed') + predictions, examples = [], [] + for uid, example in example_dict.items(): + predictions.append(store.get(uid).decode('utf-8')) + examples.append(example) + torch.distributed.barrier() + return predictions, [], examples + + +def blanklm_fix_tokenization(text): + text = text.replace('` `', '``') + text = text.replace("\' \'", "\'\'") + text = text.replace("n \' t", "n\'t") + text = text.replace("\' s", "\'s") + text = text.replace("\' m", "\'m") + text = text.replace("\' re", "\'re") + text = text.replace('. . .', '...') + text = text.replace(' . .', ' ..') + text = text.replace('- -', '--') + text = text.replace('u . s .', 'u.s.') + text = text.replace('u . k .', 'u.k.') + text = text.replace('e . 
g .', 'e.g.') + return text + + +class BlankLMEvaluater(DecoderEvaluater): + + def evaluate(self, model, dataloader, example_dict, args): + model.eval() + store = torch.distributed.TCPStore(args.master_ip, + 18931 + random.randint(0, 10000), + mpu.get_data_parallel_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + print_rank_0('Distributed store created') + + with torch.no_grad(): + for idx, data in enumerate(dataloader): + tokens, attention_mask, position_ids = process_batch( + data, args) + src_tokens = tokens + batch_size = tokens.size(0) + mask_positions = [] + current_mask = [] + for text in tokens.tolist(): + mask_positions.append([ + i for i, x in enumerate(text) if x == self.mask_token + ]) + current_mask.append(0) + # print(self.tokenizer.DecodeIds(text)) + # print(mask_positions[-1]) + counter = 0 + done = [False] * batch_size + while counter < args.tgt_seq_length: + if counter == 0: + # print(tokens) + # print(position_ids) + next_token_logits, *mems = model( + tokens, + position_ids, + attention_mask, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + position_ids = tokens.new_ones(batch_size, 2, 1) + for i, text in enumerate(tokens.tolist()): + mask_pos = mask_positions[i][current_mask[i]] + position_ids[i, 0] = mask_pos + tokens = tokens.new_zeros(batch_size, 0) + attention_mask = tokens.new_zeros(batch_size) + else: + position_ids[:, 1] = position_ids[:, 1] + 1 + last_token = tokens[:, -1:] + next_token_logits, *mems = model( + last_token, + position_ids, + attention_mask, + *mems, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + next_token_scores = F.log_softmax( + next_token_logits, dim=-1) + next_token_scores = self.processors( + tokens, next_token_scores) + next_tokens = next_token_scores.max(dim=-1)[1] + # print(self.tokenizer.DecodeIds(next_tokens.tolist())) + for i, next_token in enumerate(next_tokens.tolist()): + if next_token == self.end_token: + if current_mask[i] + 1 < len(mask_positions[i]): + current_mask[i] += 1 + next_tokens[i] = self.start_token + position_ids[i, 0] = mask_positions[i][ + current_mask[i]] + position_ids[i, 1] = 0 + else: + done[i] = True + if done[i]: + next_tokens[i] = self.pad_token + if all(done): + break + tokens = torch.cat( + [tokens, next_tokens.unsqueeze(-1)], dim=-1) + counter += 1 + predictions = [] + for i, text in enumerate(tokens.tolist()): + text = [ + token for token in text + if token not in [self.end_token, self.pad_token] + ] + blanks = [[]] + for token in text: + if token == self.start_token: + blanks.append([]) + else: + blanks[-1].append(token) + output_tokens = [] + current_blank = 0 + for token in src_tokens[i].tolist(): + if token == self.mask_token: + if current_blank < len(blanks): + output_tokens += blanks[current_blank] + current_blank += 1 + else: + if token not in [self.pad_token]: + output_tokens.append(token) + text = self.tokenizer.DecodeIds(output_tokens[:-1]) + text = blanklm_fix_tokenization(text) + predictions.append(text) + # print(text) + uid_list = data['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + for uid, prediction in zip(uid_list, predictions): + store.set(uid, prediction) + if (idx + 1) % args.log_interval == 0: + print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}') + + model.train() + torch.distributed.barrier() + print_rank_0('Evaluation completed') + predictions, examples = [], [] + for uid, example in example_dict.items(): + 
predictions.append(store.get(uid).decode('utf-8')) + examples.append(example) + torch.distributed.barrier() + return predictions, [], examples diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py new file mode 100644 index 00000000..4c0c28e7 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Race.""" +import functools +from collections import OrderedDict + +import mpu +import torch +from finetune_glm import finetune +from pretrain_glm import get_batch +from tasks.eval_utils import accuracy_func_provider +from tasks.seq2seq.dataset import (BlankLMDataset, ExtractionDataset, + Seq2SeqDataset) +from tasks.seq2seq.evaluate import (BlankLMEvaluater, DecoderEvaluater, + rouge_metric) + +global_tokenizer = None + + +def seq2seq_forward_step(data, model, args, timers, mems): + """Forward step.""" + + # Get the batch. + if timers is not None: + timers('batch generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data, args) + if timers is not None: + timers('batch generator').stop() + # Forward model. + logits, *mems = model(tokens, position_ids, attention_mask, *mems) + # logits, loss_mask = logits[:, args.src_seq_length:], loss_mask[:, args.src_seq_length:] + # target_ids = target_ids[:, args.src_seq_length:] + losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), + labels) + if args.label_smoothing > 0.0: + epsilon = args.label_smoothing + smooth_loss = -torch.nn.functional.log_softmax( + logits, dim=-1).mean(dim=-1) + losses = (1 - epsilon) * losses + epsilon * smooth_loss + loss_mask = loss_mask.reshape(-1) + # The loss is not normalized for fair comparison + loss = torch.sum(losses.reshape(-1) * loss_mask) / loss_mask.sum() + return loss, mems, 'bert' + + +def train_valid_datasets_provider(args, tokenizer): + """Provide train and validation datasets.""" + if args.task.lower() == 'blank': + train_dataset = BlankLMDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + elif args.task.lower() == 'extraction': + train_dataset = ExtractionDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + else: + train_dataset = Seq2SeqDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + global global_tokenizer + global_tokenizer = tokenizer + return train_dataset, valid_dataset + + +def metrics_func_provider(args, tokenizer, is_test): + """Provide metrics callback function.""" + + def single_dataset_provider(split): + if args.task.lower() == 'blank': + return BlankLMDataset(args, split=split, tokenizer=tokenizer) + elif args.task.lower() == 'extraction': + return ExtractionDataset(args, split=split, tokenizer=tokenizer) + else: + return Seq2SeqDataset(args, split=split, tokenizer=tokenizer) + + if args.task.lower() in ['blank', 'extraction']: + evaluater = 
BlankLMEvaluater(args, tokenizer)
+        eval_func = evaluater.evaluate
+        metric_dict = {}
+    else:
+        evaluater = DecoderEvaluater(args, tokenizer)
+        eval_func = evaluater.evaluate
+        if args.tokenizer_type == 'BertWordPieceTokenizer':
+            dataset = 'cnn_dm'
+        elif args.task.lower() == 'gigaword':
+            dataset = 'gigaword'
+        else:
+            dataset = 'cnn_dm_org'
+        metric_dict = OrderedDict({
+            'rouge-1':
+            functools.partial(rouge_metric, metric='rouge-1', dataset=dataset),
+            'rouge-2':
+            functools.partial(rouge_metric, metric='rouge-2', dataset=dataset),
+            'rouge-l':
+            functools.partial(rouge_metric, metric='rouge-l', dataset=dataset)
+        })
+
+    def output_func(predictions, examples, output_file):
+        with open(output_file + '.hyps', 'w', encoding='utf-8') as output:
+            for prediction in predictions:
+                output.write(prediction)
+                output.write('\n')
+        with open(output_file + '.refs', 'w', encoding='utf-8') as output:
+            for example in examples:
+                output.write(example.meta['ref'])
+                output.write('\n')
+        if args.task.lower() == 'squad_generation':
+            with open(
+                    output_file + '.source', 'w', encoding='utf-8') as output:
+                for example in examples:
+                    output.write(
+                        example.text_a.replace('\n', ' ') + ' Answer: '
+                        + example.meta['answer'])
+                    output.write('\n')
+
+    return accuracy_func_provider(
+        single_dataset_provider,
+        metric_dict,
+        args,
+        is_test=is_test,
+        eval_func=eval_func,
+        output_func=output_func,
+        only_rank0=False)
+
+
+def main(args):
+    if args.src_seq_length > args.max_position_embeddings:
+        args.max_position_embeddings = args.src_seq_length
+    if args.task.lower() in [
+            'cnn_dm', 'cnn_dm_original', 'gigaword', 'blank',
+            'squad_generation', 'xsum', 'extraction'
+    ]:
+        finetune(
+            args,
+            train_valid_datasets_provider, {},
+            end_of_epoch_callback_provider=metrics_func_provider,
+            forward_step=seq2seq_forward_step)
+    else:
+        raise NotImplementedError(args.task)
diff --git a/modelscope/models/nlp/mglm/tasks/superglue/README.md b/modelscope/models/nlp/mglm/tasks/superglue/README.md
new file mode 100644
index 00000000..94aab0e9
--- /dev/null
+++ b/modelscope/models/nlp/mglm/tasks/superglue/README.md
@@ -0,0 +1,137 @@
+# Use GLM for your NLU tasks
+To use GLM for your own NLU tasks, you should implement a subclass of `DataProcessor` in [tasks/superglue/dataset.py](dataset.py) and a subclass of `PVP` in [tasks/superglue/pvp.py](pvp.py). You should also specify the evaluation metrics for your task (see Section 4 below). We will take the RTE and ReCoRD tasks in SuperGLUE as examples.
+
+## 1. Design your patterns
+RTE is an NLI task in which the model is required to predict text entailment between a premise and a hypothesis. The label can be `entailment` or `not_entailment`. One sample from the training set is
+```
+premise: No Weapons of Mass Destruction Found in Iraq Yet.
+hypothesis: Weapons of Mass Destruction Found in Iraq.
+label: not_entailment
+```
+We design the pattern as
+```
+"`hypothesis`"?, [MASK], "`premise`"
+```
+GLM predicts "Yes" for `entailment` and "No" for `not_entailment`. "Yes" and "No" are called verbalizers for `entailment` and `not_entailment`.
+
+ReCoRD is a multi-choice QA task. Each example consists of a news article and a Cloze-style question about the article in which one entity is masked out. The system must predict the masked out entity from a list of possible entities in the provided passage. We directly adopt the cloze-style question as our pattern and use GLM to predict the masked entity.
+
+## 2. 
Implement subclass of `DataProcessor`
+A subclass of `DataProcessor` should implement `get_train_examples`, `get_dev_examples` and `get_test_examples`, which return the examples of the train, dev, and test sets. The returned value is a list of `InputExample`. It should also implement `get_labels` to return the list of possible labels. Here we take the `RteProcessor` as an example:
+```python
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set."""
+
+    def get_train_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "train.jsonl"), "train")
+
+    def get_dev_examples(self, data_dir, for_train=False):
+        return self._create_examples(os.path.join(data_dir, "val.jsonl"), "dev")
+
+    def get_test_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "test.jsonl"), "test")
+
+    def get_unlabeled_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "unlabeled.jsonl"), "unlabeled")
+
+    def get_labels(self):
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis",
+                         premise_name: str = "premise") -> List[InputExample]:
+        examples = []
+
+        with open(path, encoding='utf8') as f:
+            for line_idx, line in enumerate(f):
+                example_json = json.loads(line)
+                idx = example_json['idx']
+                if isinstance(idx, str):
+                    try:
+                        idx = int(idx)
+                    except ValueError:
+                        idx = line_idx
+                label = example_json.get('label')
+                guid = "%s-%s" % (set_type, idx)
+                text_a = example_json[premise_name]
+                text_b = example_json[hypothesis_name]
+
+                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx)
+                examples.append(example)
+
+        return examples
+```
+After that, you should add the implemented class to ``PROCESSORS`` at the end of [tasks/superglue/dataset.py](dataset.py):
+```python
+PROCESSORS = {
+    ...
+    "rte": RteProcessor
+}
+```
+
+## 3. Implement subclass of `PVP`
+To implement a subclass of `PVP`, you should first decide whether your verbalizers are single-token or multi-token. The verbalizers in RTE, "Yes" and "No", are single-token. In contrast, the verbalizers in ReCoRD are multi-token, as one entity can be tokenized into multiple tokens by a WordPiece or BPE tokenizer.
+
+For a single-token task, you should set `is_multi_token=False` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `verbalize` to return the verbalizer given a label. Take `RtePVP` as an example:
+```python
+class RtePVP(PVP):
+    is_multi_token = False
+    VERBALIZER = {
+        "not_entailment": [" No"],
+        "entailment": [" Yes"]
+    }
+
+    @property
+    def spell_length(self):
+        return self.pattern_id
+
+    def get_parts(self, example: InputExample) -> FilledPattern:
+        # switch text_a and text_b to get the correct order
+        text_a = example.text_a
+        text_b = example.text_b.rstrip(string.punctuation)
+        return ['"', self.shortenable(text_b), '" ?'], [[self.mask], ', "', self.shortenable(text_a), '"']
+
+    def verbalize(self, label) -> List[str]:
+        return RtePVP.VERBALIZER[label]
+```
+We use `PVP.shortenable` to mark the segments that can be truncated when the input exceeds the maximum sequence length.
+
+For a multi-token task, you should set `is_multi_token=True` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `get_answers` to return the candidates. 
Take `ReCoRDPVP` as an example: +```python +class RecordPVP(PVP): + is_multi_token = True + + def get_answers(self, example: InputExample): + choices = example.meta['candidates'] + choices = [" " + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + premise = self.shortenable(example.text_a) + + assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' + question_a, question_b = example.text_b.split('@placeholder') + return [premise, " " + question_a.rstrip(), [self.mask], question_b], [] +``` +After that, you should implement the class to `PVPS` at the end of [tasks/superglue/pvp.py](pvp.py): +```python +PVPS = { + ... + 'rte': RtePVP, + 'record': RecordPVP +} +``` +## 4. Run the experiment +To run the experiment for your new task, you should create a config file like [config_tasks/task_rte.sh](/config_tasks/task_rte.sh). You should also specify the evaluation metrics for the task in `DEFAULT_METRICS` of [tasks/superglue/finetune.py](finetune.py): +```python +DEFAULT_METRICS = { + ... + "record": [("EM", qa_exact_match), ("F1", qa_f1)], + "rte": [("accuracy", accuracy_metric)] +} +``` +Then you can run the experiment with [finetune_superglue.sh](/scripts/finetune_superglue.sh): +```shell +bash scripts/finetune_superglue.sh \ + config_tasks/model_blocklm_large.sh \ + config_tasks/task_rte.sh +``` diff --git a/modelscope/models/nlp/mglm/tasks/superglue/__init__.py b/modelscope/models/nlp/mglm/tasks/superglue/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/tasks/superglue/dataset.py b/modelscope/models/nlp/mglm/tasks/superglue/dataset.py new file mode 100644 index 00000000..36367671 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/dataset.py @@ -0,0 +1,1475 @@ +# Copyright (c) 2022 Zhipu.AI +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains the logic for loading training and test data for all tasks. 
+""" + +import copy +import csv +import glob +import os +import random +import re +from abc import ABC, abstractmethod +from collections import Counter, defaultdict +from typing import Callable, Dict, List + +import json +import numpy as np +import pandas as pd +from data_utils import (build_input_from_ids, build_sample, + num_special_tokens_to_add) +from data_utils.corpora import punctuation_standardization +from torch.utils.data import Dataset +from tqdm import tqdm +from utils import print_rank_0 + +from modelscope.models.nlp.mglm.tasks.data_utils import InputExample +from modelscope.models.nlp.mglm.tasks.superglue.pvp import PVPS + +TRAIN_SET = 'train' +DEV_SET = 'dev' +TEST_SET = 'test' +TRUE_DEV_SET = 'true_dev' +UNLABELED_SET = 'unlabeled' + +SPLIT_TYPES = [TRAIN_SET, DEV_SET, TEST_SET, TRUE_DEV_SET, UNLABELED_SET] + + +def get_output_func(task_name, args): + return PROCESSORS[task_name](args).output_prediction + + +def read_tsv(path, **kwargs): + return pd.read_csv( + path, + sep='\t', + quoting=csv.QUOTE_NONE, + dtype=str, + na_filter=False, + **kwargs) + + +class SuperGlueDataset(Dataset): + + def __init__(self, + args, + task_name, + data_dir, + seq_length, + split, + tokenizer, + for_train=False, + pattern_ensemble=False, + pattern_text=False): + self.processor = PROCESSORS[task_name](args) + args.variable_num_choices = self.processor.variable_num_choices + print_rank_0( + f'Creating {task_name} dataset from file at {data_dir} (split={split})' + ) + self.dataset_name = f'{task_name}-{split}' + self.cloze_eval = args.cloze_eval + self.seq_length = seq_length + self.tokenizer = tokenizer + self.pattern_ensemble = pattern_ensemble + self.pattern_text = pattern_text + if pattern_text: + assert self.cloze_eval, 'Labeled examples only exist in cloze evaluation' + self.args = args + if split == DEV_SET: + example_list = self.processor.get_dev_examples( + data_dir, for_train=for_train) + elif split == TEST_SET: + example_list = self.processor.get_test_examples(data_dir) + elif split == TRUE_DEV_SET: + example_list = self.processor.get_true_dev_examples(data_dir) + elif split == TRAIN_SET: + if task_name == 'wsc': + example_list = self.processor.get_train_examples( + data_dir, cloze_eval=args.cloze_eval) + else: + example_list = self.processor.get_train_examples(data_dir) + elif split == UNLABELED_SET: + example_list = self.processor.get_unlabeled_examples(data_dir) + for example in example_list: + example.label = self.processor.get_labels()[0] + else: + raise ValueError( + f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead") + if split == TEST_SET: + self.labeled = False + else: + self.labeled = True + + label_distribution = Counter(example.label for example in example_list) + print_rank_0( + f'Returning {len(example_list)} {split} examples with label dist.: {list(label_distribution.items())}' + ) + self.samples = [] + example_list.sort(key=lambda x: x.num_choices) + self.example_list = example_list + if self.cloze_eval: + if self.pattern_ensemble: + pattern_ids = PVPS[task_name].available_patterns() + self.pvps = [] + for pattern_id in pattern_ids: + self.pvps.append(PVPS[task_name]( + args, + tokenizer, + self.processor.get_labels(), + seq_length, + pattern_id=pattern_id, + num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode, + split=split)) + else: + self.pvp = PVPS[task_name]( + args, + tokenizer, + self.processor.get_labels(), + seq_length, + pattern_id=args.pattern_id, + 
num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode, + split=split) + self.examples = {example.guid: example for example in example_list} + + def __len__(self): + if self.cloze_eval and self.pattern_ensemble: + return len(self.example_list) * len(self.pvps) + else: + return len(self.example_list) + + def __getitem__(self, idx): + sample_idx = idx % len(self.example_list) + example = self.example_list[sample_idx] + if self.cloze_eval: + kwargs = {} + if self.pattern_text: + kwargs = {'labeled': True, 'priming': True} + if self.pattern_ensemble: + pvp_idx = idx // len(self.example_list) + sample = self.pvps[pvp_idx].encode(example, **kwargs) + else: + sample = self.pvp.encode(example, **kwargs) + if self.pattern_text: + eos_id = self.tokenizer.get_command('eos').Id + cls_id = self.tokenizer.get_command('ENC').Id + input_ids = [cls_id] + sample + [eos_id] + sample = { + 'text': input_ids, + 'loss_mask': np.array([1] * len(input_ids)) + } + else: + sample = self.processor.encode(example, self.tokenizer, + self.seq_length, self.args) + return sample + + +class DataProcessor(ABC): + """ + Abstract class that provides methods for loading training, testing, development and unlabeled examples for a given + task + """ + + def __init__(self, args): + self.args = args + self.num_truncated = 0 + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + for prediction, example in zip(predictions, examples): + prediction = self.get_labels()[prediction] + data = {'idx': example.idx, 'label': prediction} + output.write(json.dumps(data) + '\n') + + @property + def variable_num_choices(self): + return False + + @abstractmethod + def get_train_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the train set.""" + pass + + @abstractmethod + def get_dev_examples(self, + data_dir, + for_train=False) -> List[InputExample]: + """Get a collection of `InputExample`s for the dev set.""" + pass + + def get_test_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the test set.""" + return [] + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the unlabeled set.""" + return [] + + @abstractmethod + def get_labels(self) -> List[str]: + """Get the list of labels for this data set.""" + pass + + def get_classifier_input(self, example: InputExample, tokenizer): + return example.text_a, example.text_b + + def encode(self, example: InputExample, tokenizer, seq_length, args): + text_a, text_b = self.get_classifier_input(example, tokenizer) + tokens_a = tokenizer.EncodeAsIds(text_a).tokenization + tokens_b = tokenizer.EncodeAsIds(text_b).tokenization + num_special_tokens = num_special_tokens_to_add( + tokens_a, + tokens_b, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if len(tokens_a) + len(tokens_b) + num_special_tokens > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + seq_length, + tokenizer, + args=args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + label = 0 + if example.label is not None: + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + ids, + label=label, + types=types, + paddings=paddings, + unique_id=example.guid) 
+ else: + sample = build_sample( + ids, + positions=position_ids, + masks=sep, + label=label, + unique_id=example.guid) + return sample + + +class SuperGLUEProcessor(DataProcessor): + + def __init__(self, args): + super(SuperGLUEProcessor, self).__init__(args) + self.few_superglue = args.few_superglue + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.jsonl'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + if self.few_superglue: + return self._create_examples( + os.path.join(data_dir, 'dev32.jsonl'), 'dev') + else: + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'dev') + + def get_test_examples(self, data_dir): + if self.few_superglue: + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'test') + else: + return self._create_examples( + os.path.join(data_dir, 'test.jsonl'), 'test') + + def get_unlabeled_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'unlabeled.jsonl'), 'unlabeled') + + def _create_examples(self, *args, **kwargs): + pass + + +class RteProcessor(SuperGLUEProcessor): + """Processor for the RTE data set.""" + + def get_labels(self): + return ['entailment', 'not_entailment'] + + def _create_examples(self, + path: str, + set_type: str, + hypothesis_name: str = 'hypothesis', + premise_name: str = 'premise') -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line_idx, line in enumerate(f): + example_json = json.loads(line) + idx = example_json['idx'] + if isinstance(idx, str): + try: + idx = int(idx) + except ValueError: + idx = line_idx + label = example_json.get('label') + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization( + example_json[premise_name]) + text_b = punctuation_standardization( + example_json[hypothesis_name]) + + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx) + examples.append(example) + + return examples + + +class AxGProcessor(RteProcessor): + """Processor for the AX-G diagnostic data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-g.jsonl'), 'train') + + def get_test_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-g.jsonl'), 'test') + + +class AxBProcessor(RteProcessor): + """Processor for the AX-B diagnostic data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-b.jsonl'), 'train') + + def get_test_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-b.jsonl'), 'test') + + def _create_examples(self, + path, + set_type, + hypothesis_name='sentence2', + premise_name='sentence1'): + return super()._create_examples(path, set_type, hypothesis_name, + premise_name) + + +class CbProcessor(RteProcessor): + """Processor for the CB data set.""" + + def get_labels(self): + return ['entailment', 'contradiction', 'neutral'] + + +class WicProcessor(SuperGLUEProcessor): + """Processor for the WiC data set.""" + + def get_labels(self): + return ['false', 'true'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + if isinstance(idx, str): + idx = int(idx) + label = 'true' if example_json.get('label') else 'false' + guid = '%s-%s' % (set_type, 
idx) + text_a = punctuation_standardization(example_json['sentence1']) + text_b = punctuation_standardization(example_json['sentence2']) + meta = {'word': example_json['word']} + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx, + meta=meta) + examples.append(example) + return examples + + def get_classifier_input(self, example: InputExample, tokenizer): + text_a = example.meta['word'] + ': ' + example.text_a + return text_a, example.text_b + + +class WscProcessor(SuperGLUEProcessor): + """Processor for the WSC data set.""" + + @property + def variable_num_choices(self): + return self.args.wsc_negative + + def get_train_examples(self, data_dir, cloze_eval=True): + return self._create_examples( + os.path.join(data_dir, 'train.jsonl'), + 'train', + cloze_eval=cloze_eval) + + def get_labels(self): + return ['False', 'True'] + + def get_classifier_input(self, example: InputExample, tokenizer): + target = example.meta['span1_text'] + pronoun_idx = example.meta['span2_index'] + + # mark the pronoun with asterisks + words_a = example.text_a.split() + words_a[pronoun_idx] = '*' + words_a[pronoun_idx] + '*' + text_a = ' '.join(words_a) + text_b = target + return text_a, text_b + + def _create_examples(self, + path: str, + set_type: str, + cloze_eval=True) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + label = str( + example_json['label']) if 'label' in example_json else None + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization(example_json['text']) + meta = { + 'span1_text': example_json['target']['span1_text'], + 'span2_text': example_json['target']['span2_text'], + 'span1_index': example_json['target']['span1_index'], + 'span2_index': example_json['target']['span2_index'] + } + if 'candidates' in example_json: + candidates = [ + cand['text'] for cand in example_json['candidates'] + ] + # candidates = list(set(candidates)) + filtered = [] + for i, cand in enumerate(candidates): + if cand not in candidates[:i]: + filtered.append(cand) + candidates = filtered + + # the indices in the dataset are wrong for some examples, so we manually fix them + span1_index, span1_text = meta['span1_index'], meta[ + 'span1_text'] + span2_index, span2_text = meta['span2_index'], meta[ + 'span2_text'] + words_a = text_a.split() + words_a_lower = text_a.lower().split() + words_span1_text = span1_text.lower().split() + span1_len = len(words_span1_text) + + if words_a_lower[span1_index:span1_index + + span1_len] != words_span1_text: + for offset in [-1, +1]: + if words_a_lower[span1_index + offset:span1_index + + span1_len + + offset] == words_span1_text: + span1_index += offset + + # if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text: + # print_rank_0(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected " + # f"'{words_span1_text}' at index {span1_index} for '{words_a}'") + + if words_a[span2_index] != span2_text: + for offset in [-1, +1]: + if words_a[span2_index + offset] == span2_text: + span2_index += offset + + if words_a[span2_index] != span2_text and words_a[ + span2_index].startswith(span2_text): + words_a = words_a[:span2_index] \ + + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] + words_a[span2_index + 1:] # noqa + + assert words_a[span2_index] == span2_text, \ + f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for 
'{words_a}'" + + text_a = ' '.join(words_a) + meta['span1_index'], meta[ + 'span2_index'] = span1_index, span2_index + + if self.args.task == 'wsc1': + example = InputExample( + guid=guid, + text_a=text_a, + text_b=span1_text, + label=label, + meta=meta, + idx=idx) + examples.append(example) + if set_type == 'train' and label == 'True': + for cand in candidates: + example = InputExample( + guid=guid, + text_a=text_a, + text_b=cand, + label='False', + meta=meta, + idx=idx) + examples.append(example) + continue + + if cloze_eval and set_type == 'train' and label != 'True': + continue + if set_type == 'train' and 'candidates' in example_json and len( + candidates) > 9: + for i in range(0, len(candidates), 9): + _meta = copy.deepcopy(meta) + _meta['candidates'] = candidates[i:i + 9] + if len(_meta['candidates']) < 9: + _meta['candidates'] += candidates[:9 - len( + _meta['candidates'])] + example = InputExample( + guid=guid, + text_a=text_a, + label=label, + meta=_meta, + idx=idx) + examples.append(example) + else: + if 'candidates' in example_json: + meta['candidates'] = candidates + example = InputExample( + guid=guid, + text_a=text_a, + label=label, + meta=meta, + idx=idx) + examples.append(example) + + return examples + + +class BoolQProcessor(SuperGLUEProcessor): + """Processor for the BoolQ data set.""" + + def get_labels(self): + return ['false', 'true'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + label = str(example_json['label']).lower( + ) if 'label' in example_json else None + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization(example_json['passage']) + text_b = punctuation_standardization(example_json['question']) + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx) + examples.append(example) + + return examples + + +class CopaProcessor(SuperGLUEProcessor): + """Processor for the COPA data set.""" + + def get_labels(self): + return [0, 1] + + def encode(self, example: InputExample, tokenizer, seq_length, args): + if args.pretrained_bert: + ids_list, types_list, paddings_list = [], [], [] + else: + ids_list, positions_list, sep_list = [], [], [] + question = example.meta['question'] + joiner = 'because' if question == 'cause' else 'so' + text_a = punctuation_standardization(example.text_a) + ' ' + joiner + tokens_a = tokenizer.EncodeAsIds(text_a).tokenization + for choice in [example.meta['choice1'], example.meta['choice2']]: + choice = punctuation_standardization(choice) + tokens_b = tokenizer.EncodeAsIds(choice).tokenization + num_special_tokens = num_special_tokens_to_add( + tokens_a, + tokens_b, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if len(tokens_a) + len(tokens_b) + num_special_tokens > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + seq_length, + tokenizer, + args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if args.pretrained_bert: + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + else: + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + label = 0 + if example.label is not None: + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + 
ids_list, + label=label, + types=types_list, + paddings=paddings_list, + unique_id=example.guid) + else: + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + unique_id=example.guid) + return sample + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + label = example_json[ + 'label'] if 'label' in example_json else None + idx = example_json['idx'] + guid = '%s-%s' % (set_type, idx) + text_a = example_json['premise'] + meta = { + 'choice1': example_json['choice1'], + 'choice2': example_json['choice2'], + 'question': example_json['question'] + } + example = InputExample( + guid=guid, text_a=text_a, label=label, meta=meta, idx=idx) + examples.append(example) + + if set_type == 'train' or set_type == 'unlabeled': + mirror_examples = [] + for ex in examples: + label = 1 if ex.label == 0 else 0 + meta = { + 'choice1': ex.meta['choice2'], + 'choice2': ex.meta['choice1'], + 'question': ex.meta['question'] + } + mirror_example = InputExample( + guid=ex.guid + 'm', + text_a=ex.text_a, + label=label, + meta=meta) + mirror_examples.append(mirror_example) + examples += mirror_examples + print_rank_0( + f'Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...' + ) + return examples + + +class MultiRcProcessor(SuperGLUEProcessor): + """Processor for the MultiRC data set.""" + + def get_labels(self): + return [0, 1] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + + passage_idx = example_json['idx'] + text = punctuation_standardization( + example_json['passage']['text']) + questions = example_json['passage']['questions'] + for question_json in questions: + question = punctuation_standardization( + question_json['question']) + question_idx = question_json['idx'] + answers = question_json['answers'] + for answer_json in answers: + label = answer_json[ + 'label'] if 'label' in answer_json else None + answer_idx = answer_json['idx'] + guid = f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}' + meta = { + 'passage_idx': + passage_idx, + 'question_idx': + question_idx, + 'answer_idx': + answer_idx, + 'answer': + punctuation_standardization(answer_json['text']) + } + idx = [passage_idx, question_idx, answer_idx] + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label=label, + meta=meta, + idx=idx) + examples.append(example) + + question_indices = list( + set(example.meta['question_idx'] for example in examples)) + label_distribution = Counter(example.label for example in examples) + print_rank_0( + f'Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label ' + f'distribution {list(label_distribution.items())}') + return examples + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + passage_dict = defaultdict(list) + for prediction, example in zip(predictions, examples): + passage_dict[example.meta['passage_idx']].append( + (prediction, example)) + for passage_idx, data in passage_dict.items(): + question_dict = defaultdict(list) + passage_data = { + 'idx': passage_idx, + 'passage': { + 'questions': [] + } + } + for prediction, example in data: + question_dict[example.meta['question_idx']].append( + (prediction, 
example)) + for question_idx, data in question_dict.items(): + question_data = {'idx': question_idx, 'answers': []} + for prediction, example in data: + prediction = self.get_labels()[prediction] + question_data['answers'].append({ + 'idx': + example.meta['answer_idx'], + 'label': + prediction + }) + passage_data['passage']['questions'].append(question_data) + output.write(json.dumps(passage_data) + '\n') + + def get_classifier_input(self, example: InputExample, tokenizer): + text_a = example.text_a + text_b = ' '.join([example.text_b, 'answer:', example.meta['answer']]) + return text_a, text_b + + +class RaceProcessor(DataProcessor): + + @property + def variable_num_choices(self): + return True + + def get_labels(self): + return ['A', 'B', 'C', 'D'] + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'train'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev'), 'dev', for_train=for_train) + + def get_test_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'test'), 'test') + + @staticmethod + def _create_examples(path, + set_type, + for_train=False) -> List[InputExample]: + examples = [] + + def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace('\n', ' ') + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + filenames = glob.glob(os.path.join( + path, 'middle', '*.txt')) + glob.glob( + os.path.join(path, 'high', '*.txt')) + for filename in filenames: + with open(filename, 'r') as f: + for line in f: + data = json.loads(line) + idx = data['id'] + context = data['article'] + questions = data['questions'] + choices = data['options'] + answers = data['answers'] + # Check the length. 
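+                    # (each question must come with exactly one answer and one option list)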
+ assert len(questions) == len(answers) + assert len(questions) == len(choices) + + context = clean_text(context) + for question_idx, question in enumerate(questions): + answer = answers[question_idx] + choice = choices[question_idx] + guid = f'{set_type}-p{idx}-q{question_idx}' + ex_idx = [set_type, idx, question_idx] + meta = {'choices': choice} + example = InputExample( + guid=guid, + text_a=context, + text_b=question, + label=answer, + meta=meta, + idx=ex_idx) + examples.append(example) + return examples + + +class RecordProcessor(SuperGLUEProcessor): + """Processor for the ReCoRD data set.""" + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'dev', for_train=for_train) + + @property + def variable_num_choices(self): + return True + + def get_labels(self): + return ['0', '1'] + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + for prediction, example in zip(predictions, examples): + prediction = example.meta['candidates'][prediction] + data = {'idx': example.idx, 'label': prediction} + output.write(json.dumps(data) + '\n') + + def encode(self, example: InputExample, tokenizer, seq_length, args): + if args.pretrained_bert: + ids_list, types_list, paddings_list = [], [], [] + else: + ids_list, positions_list, sep_list = [], [], [] + tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization + tokens_b = tokenizer.EncodeAsIds( + example.text_b).tokenization if example.text_b else None + for answer in example.meta['candidates']: + answer_ids = tokenizer.EncodeAsIds(answer).tokenization + total_length = len(tokens_a) + len(tokens_b) + len(answer_ids) + total_length += num_special_tokens_to_add( + tokens_a, + tokens_b + answer_ids, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if total_length > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b + answer_ids, + None, + seq_length, + tokenizer, + args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if args.pretrained_bert: + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + else: + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + ids_list, + label=label, + types=types_list, + paddings=paddings_list, + unique_id=example.guid) + else: + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + unique_id=example.guid) + return sample + + @staticmethod + def _create_examples(path, + set_type, + seed=42, + max_train_candidates_per_question: int = 10, + for_train=False) -> List[InputExample]: + examples = [] + + entity_shuffler = random.Random(seed) + + with open(path, encoding='utf8') as f: + for idx, line in enumerate(f): + example_json = json.loads(line) + + idx = example_json['idx'] + text = punctuation_standardization( + example_json['passage']['text']) + entities = set() + + for entity_json in example_json['passage']['entities']: + start = entity_json['start'] + end = entity_json['end'] + entity = punctuation_standardization(text[start:end + 1]) + entities.add(entity) + + entities = list(entities) + entities.sort() + + text = text.replace( + '@highlight\n', '- ' + ) # we follow the GPT-3 paper wrt @highlight annotations + questions = 
example_json['qas'] + + for question_json in questions: + question = punctuation_standardization( + question_json['query']) + question_idx = question_json['idx'] + answers = set() + + for answer_json in question_json.get('answers', []): + answer = punctuation_standardization( + answer_json['text']) + answers.add(answer) + + answers = list(answers) + + if set_type == 'train' or for_train: + # create a single example per *correct* answer + for answer_idx, answer in enumerate(answers): + candidates = [ + ent for ent in entities if ent not in answers + ] + if len(candidates + ) > max_train_candidates_per_question - 1: + entity_shuffler.shuffle(candidates) + candidates = candidates[: + max_train_candidates_per_question + - 1] + + guid = f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}' + meta = { + 'passage_idx': idx, + 'question_idx': question_idx, + 'candidates': [answer] + candidates, + 'answers': [answer] + } + ex_idx = [idx, question_idx, answer_idx] + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label='0', + meta=meta, + idx=ex_idx, + num_choices=len(candidates) + 1) + examples.append(example) + + else: + # create just one example with *all* correct answers and *all* answer candidates + guid = f'{set_type}-p{idx}-q{question_idx}' + meta = { + 'passage_idx': idx, + 'question_idx': question_idx, + 'candidates': entities, + 'answers': answers + } + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label='1', + meta=meta, + idx=question_idx, + num_choices=len(entities)) + examples.append(example) + + question_indices = list( + set(example.meta['question_idx'] for example in examples)) + label_distribution = Counter(example.label for example in examples) + print_rank_0( + f'Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label ' + f'distribution {list(label_distribution.items())}') + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.tsv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev_matched.tsv'), 'dev_matched') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test_matched.tsv'), 'test_matched') + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['contradiction', 'entailment', 'neutral'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['sentence1']) + text_b = punctuation_standardization(row['sentence2']) + label = row.get('gold_label', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev_mismatched.tsv'), 'dev_mismatched') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test_mismatched.tsv'), 
'test_mismatched') + + +class AgnewsProcessor(DataProcessor): + """Processor for the AG news data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2', '3', '4'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path) as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, headline, body = row + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization( + headline.replace('\\', ' ')) + text_b = punctuation_standardization(body.replace('\\', ' ')) + + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class YahooAnswersProcessor(DataProcessor): + """Processor for the Yahoo Answers data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, question_title, question_body, answer = row + guid = '%s-%s' % (set_type, idx) + text_a = ' '.join([ + question_title.replace('\\n', ' ').replace('\\', ' '), + question_body.replace('\\n', ' ').replace('\\', ' ') + ]) + text_a = punctuation_standardization(text_a) + text_b = answer.replace('\\n', ' ').replace('\\', ' ') + text_b = punctuation_standardization(text_b) + + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class YelpPolarityProcessor(DataProcessor): + """Processor for the YELP binary classification set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path) as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, body = row + guid = '%s-%s' % (set_type, idx) + text_a = body.replace('\\n', ' ').replace('\\', ' ') + text_a = punctuation_standardization(text_a) + + example = 
InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class YelpFullProcessor(YelpPolarityProcessor): + """Processor for the YELP full classification set.""" + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_labels(self): + return ['1', '2', '3', '4', '5'] + + +class XStanceProcessor(DataProcessor): + """Processor for the X-Stance data set.""" + + def __init__(self, args, language: str = None): + super().__init__(args) + if language is not None: + assert language in ['de', 'fr'] + self.language = language + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'train.jsonl')) + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.jsonl')) + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['FAVOR', 'AGAINST'] + + def _create_examples(self, path: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + label = example_json['label'] + id_ = example_json['id'] + text_a = punctuation_standardization(example_json['question']) + text_b = punctuation_standardization(example_json['comment']) + language = example_json['language'] + + if self.language is not None and language != self.language: + continue + + example = InputExample( + guid=id_, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class Sst2Processor(DataProcessor): + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.tsv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'dev.tsv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test.tsv'), 'test') + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['sentence']) + label = row.get('label', None) + example = InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class ColaProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + if set_type != 'test': + df = read_tsv(path, header=None) + else: + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + if set_type != 'test': + text_a = punctuation_standardization(row[3]) + label = row[1] + else: + text_a = punctuation_standardization(row['sentence']) + label = None + example = InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class MrpcProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = 
punctuation_standardization(row['#1 String']) + text_b = punctuation_standardization(row['#2 String']) + label = row.get('Quality', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class QqpProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['question1']) + text_b = punctuation_standardization(row['question2']) + label = row.get('is_duplicate', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class QnliProcessor(Sst2Processor): + + def get_labels(self): + return ['entailment', 'not_entailment'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['question']) + text_b = punctuation_standardization(row['sentence']) + label = row.get('label', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class SquadProcessor(DataProcessor): + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train-v2.0.json'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev-v2.0.json'), 'dev') + + def get_labels(self): + return ['0'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + with open(path) as f: + data = json.load(f)['data'] + + for idx, passage in enumerate(data): + for pid, paragraph in enumerate(passage['paragraphs']): + context = paragraph['context'] + for qid, qas in enumerate(paragraph['qas']): + if len(qas['answers']) == 0: + continue + guid = f'{set_type}-{idx}-{pid}-{qid}' + example = InputExample( + guid=guid, + text_a=context, + text_b=qas['question'], + label='0', + meta={'answer': qas['answers'][0]}) + examples.append(example) + + return examples + + +CLASSIFICATION_DATASETS = {'wic', 'rte', 'cb', 'boolq', 'multirc', 'wsc'} +MULTI_CHOICE_DATASETS = {'copa', 'record'} + +PROCESSORS = { + 'mnli': MnliProcessor, + 'mnli-mm': MnliMismatchedProcessor, + 'agnews': AgnewsProcessor, + 'yahoo': YahooAnswersProcessor, + 'yelp-polarity': YelpPolarityProcessor, + 'yelp-full': YelpFullProcessor, + 'xstance-de': lambda: XStanceProcessor('de'), + 'xstance-fr': lambda: XStanceProcessor('fr'), + 'xstance': XStanceProcessor, + 'wic': WicProcessor, + 'rte': RteProcessor, + 'cb': CbProcessor, + 'wsc': WscProcessor, + 'wsc1': WscProcessor, + 'boolq': BoolQProcessor, + 'copa': CopaProcessor, + 'multirc': MultiRcProcessor, + 'record': RecordProcessor, + 'ax-g': AxGProcessor, + 'ax-b': AxBProcessor, + 'sst2': Sst2Processor, + 'cola': ColaProcessor, + 'mrpc': MrpcProcessor, + 'qqp': QqpProcessor, + 'qnli': QnliProcessor, + 'squad': SquadProcessor, + 'race': RaceProcessor, + 'squad': SquadProcessor +} # type: Dict[str,Callable[[1],DataProcessor]] diff --git a/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py b/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py new file mode 100644 index 00000000..145fb45b --- /dev/null +++ 
b/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 Zhipu.AI +""" +Official evaluation script for ReCoRD v1.0. +(Some functions are adopted from the SQuAD evaluation script.) +""" + +from __future__ import print_function +import functools +import re +import string +from collections import Counter, defaultdict +from typing import List + +from tasks.data_utils import InputExample + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + if not ground_truths: + return 0.0 + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def qa_evaluate(predictions, labels, examples: List[InputExample], metric): + assert len(examples) == len(predictions) + score = 0.0 + for example, prediction in zip(examples, predictions): + ground_truths = example.meta['answers'] + prediction = example.meta['candidates'][prediction] + if ground_truths: + score += metric_max_over_ground_truths(metric, prediction, + ground_truths) + score = 100.0 * score / len(predictions) + return score + + +def multirc_em(predictions, labels, examples: List[InputExample]): + """Compute the exact match (EM) for a sequence of predictions and actual labels""" + question_ids = [example.meta['question_idx'] for example in examples] + unique_questions = set(question_ids) + + q_actuals = list(zip(question_ids, labels)) + q_predictions = list(zip(question_ids, predictions)) + + actuals_per_question = defaultdict(list) + predictions_per_question = defaultdict(list) + + for qid, val in q_actuals: + actuals_per_question[qid].append(val) + for qid, val in q_predictions: + predictions_per_question[qid].append(val) + + em = 0 + for qid in unique_questions: + if actuals_per_question[qid] == predictions_per_question[qid]: + em += 1 + em /= len(unique_questions) + return em + + +qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) +qa_f1 = functools.partial(qa_evaluate, metric=f1_score) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/finetune.py b/modelscope/models/nlp/mglm/tasks/superglue/finetune.py new file mode 100644 index 00000000..371705ff --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/finetune.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Race.""" + +from collections import OrderedDict + +from finetune_glm import finetune +from tasks.eval_utils import (accuracy_func_provider, accuracy_metric, + f1_macro_metric, f1_metric) +from tasks.superglue.dataset import (CLASSIFICATION_DATASETS, + MULTI_CHOICE_DATASETS, PROCESSORS, + SuperGlueDataset, get_output_func) +from tasks.superglue.evaluate import multirc_em, qa_exact_match, qa_f1 +from tasks.superglue.pvp import PVPS + +DEFAULT_METRICS = { + 'record': [('EM', qa_exact_match), ('F1', qa_f1)], + 'copa': [('accuracy', accuracy_metric)], + 'rte': [('accuracy', accuracy_metric)], + 'boolq': [('accuracy', accuracy_metric)], + 'wic': [('accuracy', accuracy_metric)], + 'wsc': [('accuracy', accuracy_metric)], + 'cb': [('accuracy', accuracy_metric), ('f1-macro', f1_macro_metric)], + 'multirc': [('f1a', f1_metric), ('em', multirc_em), + ('acc', accuracy_metric)], + 'mnli': [('accuracy', accuracy_metric)], + 'sst2': [('accuracy', accuracy_metric)], + 'qnli': [('accuracy', accuracy_metric)], + 'qqp': [('accuracy', accuracy_metric)], + 'mrpc': [('accuracy', accuracy_metric)], + 'cola': [('accuracy', accuracy_metric)], + 'squad': [('accuracy', accuracy_metric)], +} + + +def train_valid_datasets_provider(args, tokenizer, pattern_text=False): + """Provide train and validation datasets.""" + task_name = args.task.lower() + data_dir = args.data_dir + train_dataset = SuperGlueDataset( + args, + task_name, + data_dir, + args.seq_length, + 'train', + tokenizer, + pattern_text=pattern_text) + valid_dataset = SuperGlueDataset( + args, + task_name, + data_dir, + args.seq_length, + 'dev', + tokenizer, + for_train=True, + pattern_text=pattern_text) + + return train_dataset, valid_dataset + + +def metrics_func_provider(args, tokenizer, is_test): + """Privde metrics callback function.""" + + def single_dataset_provider(split): + return SuperGlueDataset(args, args.task.lower(), args.data_dir, + args.seq_length, split, tokenizer) + + output_func = get_output_func(args.task.lower(), args) + eval_func = None + if args.task.lower() in ['wsc', 'squad' + ] and args.cloze_eval and not args.wsc_negative: + from tasks.language_model.finetune import classify_evaluate + eval_func = classify_evaluate + metric_dict = OrderedDict(DEFAULT_METRICS[args.task.lower()]) + return accuracy_func_provider( + single_dataset_provider, + metric_dict, + args, + is_test=is_test, + eval_func=eval_func, + output_func=output_func, + only_rank0=False, + tokenizer=tokenizer) + + +def main(args): + model_kwargs = {} + processor = PROCESSORS[args.task.lower()](args) + pvp = PVPS[args.task.lower()]( + args, + None, + processor.get_labels(), + args.seq_length, + pattern_id=args.pattern_id, + is_multi_token=args.multi_token, + num_prompt_tokens=args.num_prompt_tokens) + if args.continuous_prompt: + model_kwargs['spell_length'] = pvp.spell_length + if args.task.lower() in ['wsc', 'squad' + ] and args.cloze_eval and not args.wsc_negative: + from tasks.language_model.finetune import lm_forward_step 
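+        # Cloze-style WSC/SQuAD evaluation runs through the language-model code path,
+        # so its forward step is reused for fine-tuning below.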
+ finetune( + args, + train_valid_datasets_provider, + model_kwargs, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=lm_forward_step) + else: + if args.cloze_eval: + multi_token = pvp.is_multi_token + else: + multi_token = args.task.lower() in MULTI_CHOICE_DATASETS + args.multi_token = multi_token + if not multi_token: + model_kwargs[ + 'model_type'] = 'multiple_choice' if args.cloze_eval else 'classification' + model_kwargs['multi_token'] = False + model_kwargs['num_labels'] = len(processor.get_labels()) + else: + model_kwargs['model_type'] = 'multiple_choice' + model_kwargs['multi_token'] = True + model_kwargs['num_labels'] = 1 + finetune( + args, + train_valid_datasets_provider, + model_kwargs, + end_of_epoch_callback_provider=metrics_func_provider) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py new file mode 100644 index 00000000..ff394172 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py @@ -0,0 +1,1541 @@ +# Copyright (c) 2022 Zhipu.AI +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains the pattern-verbalizer pairs (PVPs) for all tasks. +""" +import copy +import math +import random +import string +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Dict, List, Tuple, Union + +import numpy as np +from tasks.data_utils import (InputExample, build_decoder_input, + build_decoder_sample, build_input_from_ids, + build_sample, num_special_tokens_to_add) +from utils import print_rank_0 + +FilledPattern = Tuple[List[Union[str, Tuple[str, bool]]], + List[Union[str, Tuple[str, bool]]]] + + +class PVP(ABC): + """ + This class contains functions to apply patterns and verbalizers as required by PET. Each task requires its own + custom implementation of a PVP. + """ + + def __init__(self, + args, + tokenizer, + label_list, + max_seq_length, + pattern_id: int = 0, + verbalizer_file: str = None, + seed: int = 42, + is_multi_token=False, + max_segment_length=0, + fast_decode: bool = False, + split='train', + num_prompt_tokens=0): + """ + Create a new PVP. 
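+        A PVP turns an InputExample into model-ready token ids by filling in a
+        task-specific pattern and mapping each label to its verbalizer tokens.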
+ + :param args: the args + :param tokenizer: the tokenizer + :param label_list: the list of labels + :param max_seq_length: the maximum length of the sequence + :param pattern_id: the pattern id to use + :param seed: a seed to be used for generating random numbers if necessary + :param is_multi_token: if the verbalizers contain multiple tokens + :param fast_decode: whether to use the fast decode mode for multi-token tasks + :param continuous_prompt: whether to use continuous prompt optimization + """ + self.args = args + self.tokenizer = tokenizer + self.label_list = label_list + self.max_seq_length = max_seq_length + self.pattern_id = pattern_id + self.num_prompt_tokens = num_prompt_tokens + self.rng = random.Random(seed) + self.num_truncated = 0 + self.fast_decode = fast_decode + self.split = split + self.max_dec_seq_length = 16 + self._is_multi_token = is_multi_token + self.max_segment_length = max_segment_length + self.task_mask = args.task_mask + self.continuous_prompt = args.continuous_prompt + self.prefix_prompt = args.prefix_prompt + if self.continuous_prompt: + print_rank_0( + f'Prompt tokens in pvp {self.num_prompt_tokens} spell length {self.spell_length}' + ) + + if verbalizer_file: + self.verbalize = PVP._load_verbalizer_from_file( + verbalizer_file, self.pattern_id) + + @property + def is_multi_token(self): + return self._is_multi_token + + @property + def spell_length(self): + return 0 + + @property + def mask(self) -> str: + """Return the underlying LM's mask token""" + return self.tokenizer.get_command('MASK').Id + + @property + def mask_id(self) -> int: + """Return the underlying LM's mask id""" + return self.tokenizer.get_command('MASK').Id + + @property + def max_num_verbalizers(self) -> int: + """Return the maximum number of verbalizers across all labels""" + return max(len(self.verbalize(label)) for label in self.label_list) + + @staticmethod + def shortenable(s): + """Return an instance of this string that is marked as shortenable""" + return s, True + + @staticmethod + def remove_final_punc(s: Union[str, Tuple[str, bool]]): + """Remove the final punctuation mark""" + if isinstance(s, tuple): + return PVP.remove_final_punc(s[0]), s[1] + return s.rstrip(string.punctuation) + + @staticmethod + def lowercase_first(s: Union[str, Tuple[str, bool]]): + """Lowercase the first character""" + if isinstance(s, tuple): + return PVP.lowercase_first(s[0]), s[1] + return s[0].lower() + s[1:] + + @staticmethod + def uppercase_first(s: Union[str, Tuple[str, bool]]): + """Lowercase the first character""" + if isinstance(s, tuple): + return PVP.uppercase_first(s[0]), s[1] + return s[0].upper() + s[1:] + + @staticmethod + def available_patterns(): + return [0] + + def replace_prompt_tokens(self, parts_a, parts_b): + if not self.continuous_prompt: + parts_a = [part for part in parts_a if part is not None] + parts_b = [part for part in parts_b if part is not None] + return parts_a, parts_b + num_prompt_tokens = self.num_prompt_tokens + num_pos = 0 + for parts in (parts_a, parts_b): + for part in parts: + if part is None: + num_pos += 1 + avg_prompt_tokens = math.ceil(num_prompt_tokens / num_pos) + new_parts_a, new_parts_b = [], [] + for part in parts_a: + if part is None: + if num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts_a.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts_a.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts_a.append(part) + for part in parts_b: + if part is None: + if 
num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts_b.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts_b.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts_b.append(part) + return new_parts_a, new_parts_b + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. + + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + raw_parts_a, raw_parts_b = self.get_parts(example) + + raw_parts_a = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_a + ] + prompt_id = tokenizer.num_tokens + + def encode_input(raw_parts): + parts = [] + for x, s in raw_parts: + if isinstance(x, str): + x = tokenizer.EncodeAsIds(x) + elif isinstance(x, int): + x = [prompt_id] * x + else: + pass + parts.append((x, s)) + return parts + + parts_a = encode_input(raw_parts_a) + if self.prefix_prompt > 0: + parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a + + parts_b = None + if raw_parts_b: + raw_parts_b = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_b + ] + parts_b = encode_input(raw_parts_b) + + if self.is_multi_token: + answers = self.get_answers(example) + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + + if not self.fast_decode: + ids_list, positions_list, sep_list, mask_list, target_list, prompt_list = [], [], [], [], [], [] + segment_id_list = [] + if priming: + answer = answers[label] + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + self.num_truncated += self.truncate( + parts_a, + parts_b, + answer_ids, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in parts_a for token_id in part + ] + tokens_b = [ + token_id for part, _ in parts_b for token_id in part + ] if parts_b else None + input_ids = tokens_a + if tokens_b: + input_ids += tokens_b + if labeled: + mask_idx = input_ids.index(self.mask_id) + input_ids = input_ids[: + mask_idx] + answer_ids + input_ids[ + mask_idx + 1:] + return input_ids + else: + for idx, answer in enumerate(answers): + this_parts_a, this_parts_b = copy.deepcopy( + parts_a), copy.deepcopy(parts_b) + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [ + tokenizer.get_command('eop').Id + ] + self.num_truncated += self.truncate( + this_parts_a, + this_parts_b, + answer_ids, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in this_parts_a + for token_id in part + ] + tokens_b = [ + token_id for part, _ in this_parts_b + for token_id in part + ] if parts_b else None + if self.max_segment_length > 0: + num_segments = (len(answer_ids) + - 1) // self.max_segment_length + 1 + segments = [ + answer_ids[index + * self.max_segment_length:(index + + 1) + * self.max_segment_length] + for index in range(num_segments) + ] + segment_id_list += [idx] * len(segments) + else: + segments = [answer_ids] + for segment in segments: + data = build_input_from_ids( + tokens_a, + tokens_b, + segment, + self.max_seq_length, + 
self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) + if token == prompt_id + ] + ids = [ + idx if idx != prompt_id else 0 for idx in ids + ] + prompt_list.append(prompt_pos) + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + target_list.append(target_ids) + mask_list.append(loss_masks) + if self.mask in tokens_a: + mask_pos = tokens_a.index(self.mask) + tokens_a = tokens_a[: + mask_pos] + segment + tokens_a[ + mask_pos:] + else: + mask_pos = tokens_b.index(self.mask) + tokens_b = tokens_b[: + mask_pos] + segment + tokens_b[ + mask_pos:] + segment_id_list = segment_id_list if segment_id_list else None + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + logit_mask=mask_list, + target=target_list, + unique_id=example.guid, + segment_ids=segment_id_list, + prompt_ids=prompt_list) + return sample + else: + this_parts_a, this_parts_b = copy.deepcopy( + parts_a), copy.deepcopy(parts_b) + self.num_truncated += self.truncate( + this_parts_a, + this_parts_b, + None, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in this_parts_a for token_id in part + ] + tokens_b = [ + token_id for part, _ in this_parts_b for token_id in part + ] if parts_b else None + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + sample = build_sample( + ids, + positions=position_ids, + masks=sep, + label=label, + unique_id=example.guid) + + ids_list, positions_list, mask_list, target_list, logit_mask_list = [], [], [], [], [] + for answer in answers: + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [tokenizer.get_command('eop').Id] + answer_ids = answer_ids[:self.max_dec_seq_length] + data = build_decoder_input(ids, answer_ids, + self.max_seq_length, + self.max_dec_seq_length, + tokenizer) + dec_ids, _, _, dec_position_ids, _, dec_target_ids, dec_loss_masks = data + ids_list.append(dec_ids) + positions_list.append(dec_position_ids) + mask_list.append(sep) + target_list.append(dec_target_ids) + logit_mask_list.append(dec_loss_masks) + + sample = build_decoder_sample(sample, ids_list, positions_list, + mask_list, target_list, + logit_mask_list) + return sample + + else: + self.num_truncated += self.truncate( + parts_a, parts_b, [], max_length=self.max_seq_length) + + tokens_a = [token_id for part, _ in parts_a for token_id in part] + tokens_b = [token_id for part, _ in parts_b + for token_id in part] if parts_b else None + if priming: + input_ids = tokens_a + if tokens_b: + input_ids += tokens_b + if labeled: + mask_idx = input_ids.index(self.mask_id) + verbalizer = self.verbalize(example.label) + assert len( + verbalizer + ) == 1, 'priming only supports one verbalization per label' + verbalizer = verbalizer[0] + verbalizer_id = get_verbalization_ids( + verbalizer, self.tokenizer, force_single_token=True) + input_ids[mask_idx] = verbalizer_id + return input_ids + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, 
target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) if token == prompt_id + ] + ids = [token if token != prompt_id else 0 for token in ids] + target_ids = self.get_verbalizer_ids() + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + sample = build_sample( + ids=ids, + positions=position_ids, + target=target_ids, + masks=sep, + logit_mask=loss_masks, + label=label, + unique_id=example.guid, + prompt_ids=prompt_pos) + return sample + + @staticmethod + def _seq_length(parts: List[Tuple[List[int], bool]], + only_shortenable: bool = False): + return sum([ + len(x) for x, shortenable in parts + if not only_shortenable or shortenable + ]) if parts else 0 + + @staticmethod + def _remove_last(parts: List[Tuple[List[int], bool]]): + last_idx = max(idx for idx, (seq, shortenable) in enumerate(parts) + if shortenable and seq) + parts[last_idx] = (parts[last_idx][0][:-1], parts[last_idx][1]) + + def truncate(self, parts_a: List[Tuple[List[int], bool]], + parts_b: List[Tuple[List[int], bool]], answer: List[int], + max_length: int): + """Truncate two sequences of text to a predefined total maximum length""" + total_len = self._seq_length(parts_a) + self._seq_length(parts_b) + if answer: + total_len += len(answer) + total_len += num_special_tokens_to_add( + parts_a, + parts_b, + answer, + add_cls=True, + add_sep=False, + add_piece=True) + num_tokens_to_remove = total_len - max_length + + if num_tokens_to_remove <= 0: + return False + + for _ in range(num_tokens_to_remove): + if self._seq_length( + parts_a, only_shortenable=True) > self._seq_length( + parts_b, only_shortenable=True): + self._remove_last(parts_a) + else: + self._remove_last(parts_b) + return True + + @abstractmethod + def get_parts(self, example: InputExample) -> FilledPattern: + """ + Given an input example, apply a pattern to obtain two text sequences (text_a and text_b) containing exactly one + mask token (or one consecutive sequence of mask tokens for PET with multiple masks). If a task requires only a + single sequence of text, the second sequence should be an empty list. + + :param example: the input example to process + :return: Two sequences of text. All text segments can optionally be marked as being shortenable. + """ + pass + + def get_answers(self, example: InputExample): + return [self.verbalize(label)[0] for label in self.label_list] + + def get_verbalizer_ids(self): + target_ids = [] + for label in self.label_list: + verbalizer = self.verbalize(label)[0] + verbalizer_id = get_verbalization_ids( + verbalizer, self.tokenizer, force_single_token=True) + target_ids.append(verbalizer_id) + return target_ids + + @abstractmethod + def verbalize(self, label) -> List[str]: + """ + Return all verbalizations for a given label. 
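+        For example, ``RtePVP`` below verbalizes 'entailment' as ' Yes' and
+        'not_entailment' as ' No'.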
+ + :param label: the label + :return: the list of verbalizations + """ + pass + + def get_mask_positions(self, input_ids: List[int]) -> List[int]: + label_idx = input_ids.index(self.mask_id) + labels = [-1] * len(input_ids) + labels[label_idx] = 1 + return labels + + @staticmethod + def _load_verbalizer_from_file(path: str, pattern_id: int): + + verbalizers = defaultdict( + dict) # type: Dict[int, Dict[str, List[str]]] + current_pattern_id = None + + with open(path, 'r') as fh: + for line in fh.read().splitlines(): + if line.isdigit(): + current_pattern_id = int(line) + elif line: + label, *realizations = line.split() + verbalizers[current_pattern_id][label] = realizations + + print_rank_0( + 'Automatically loaded the following verbalizer: \n {}'.format( + verbalizers[pattern_id])) + + def verbalize(label) -> List[str]: + return verbalizers[pattern_id][label] + + return verbalize + + +class CopaPVP(PVP): + + @staticmethod + def available_patterns(): + return [0, 1] + + @property + def is_multi_token(self): + return True + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + @property + def mask(self) -> str: + """Return the underlying LM's mask token""" + mask_token = 'MASK' + return self.tokenizer.get_command(mask_token).Id + + @property + def mask_id(self) -> int: + """Return the underlying LM's mask id""" + mask_token = 'MASK' + return self.tokenizer.get_command(mask_token).Id + + def get_answers(self, example: InputExample): + choice1 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + return [choice1, choice2] + + def get_parts(self, example: InputExample) -> FilledPattern: + assert self.pattern_id in [0, 1, 2, 3] + premise = self.remove_final_punc( + self.shortenable(' ' + example.text_a)) + choice1 = self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + + question = example.meta['question'] + assert question in ['cause', 'effect'] + if question == 'cause': + joiner = ' because' + else: + joiner = ', so' + if self.pattern_id == 0: + parts_a, parts_b = [ + None, '"', choice1, '" or "', choice2, '"?', None, premise, + joiner, None, [self.mask], '.' + ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + None, choice1, ' or', ' ' + choice2, '?', None, premise, + joiner, None, [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, '"', choice1, '" or "', choice2, '"', None, premise, + joiner, [self.mask], '.', None + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + return [] + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. 
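+        For pattern ids >= 2 (without continuous prompts), each COPA choice is
+        appended after the mask token, building one candidate sequence per choice.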
+ + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + if self.continuous_prompt or self.pattern_id < 2: + return super().encode(example, priming=priming, labeled=labeled) + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + premise = self.remove_final_punc(self.shortenable(example.text_a)) + choice1 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + question = example.meta['question'] + assert question in ['cause', 'effect'] + answer = ' because' if question == 'cause' else ' so' + answer_ids = [ + get_verbalization_ids(answer, tokenizer, force_single_token=True) + ] + if self.is_multi_token: + answer_ids.append(tokenizer.get_command('eop').Id) + + ids_list, positions_list, sep_list, mask_list, target_list = [], [], [], [], [] + + for choice in [choice1, choice2]: + parts = [ + '"', choice1[1:], '" or "', choice2[1:], '"?', premise, + [self.mask], choice + ] + parts = [x if isinstance(x, tuple) else (x, False) for x in parts] + parts = [(tokenizer.EncodeAsIds(x).tokenization if isinstance( + x, str) else x, s) for x, s in parts if x] + self.num_truncated += self.truncate( + parts, None, answer_ids, max_length=self.max_seq_length) + tokens_a = [token_id for part, _ in parts for token_id in part] + data = build_input_from_ids( + tokens_a, + None, + answer_ids, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + target_list.append(target_ids) + mask_list.append(loss_masks) + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + logit_mask=mask_list, + target=target_list, + unique_id=example.guid) + return sample + + +class WscPVP(PVP): + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def is_multi_token(self): + return True + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_answers(self, example: InputExample): + target = ' ' + example.meta['span1_text'] + answers = [target] + if 'candidates' in example.meta: + candidates = example.meta['candidates'] + # if len(candidates) > 10: + # random.shuffle(candidates) + # candidates = candidates[:10] + answers += [' ' + cand for cand in candidates] + return answers + + def get_parts(self, example: InputExample) -> FilledPattern: + pronoun = example.meta['span2_text'] + pronoun_idx = example.meta['span2_index'] + + words_a = example.text_a.split() + words_a[pronoun_idx] = '*' + words_a[pronoun_idx] + '*' + text_a = ' '.join(words_a) + text_a = self.shortenable(text_a) + + if self.pattern_id == 0: + parts_a, parts_b = [ + None, text_a, + None, " The pronoun '*" + pronoun + "*' refers to", None, + [self.mask], '.' 
+ ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + None, text_a, None, " In the previous sentence, the pronoun '*" + + pronoun + "*' refers to", None, [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, text_a, None, + " Question: In the passage above, what does the pronoun '*" + + pronoun + "*' refer to?", None, ' Answer:', [self.mask], '.' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. + + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + if self.args.loss_func in ['generative', 'mix']: + sample = super().encode(example, priming=priming, labeled=labeled) + if self.split == 'train': + sample['label'] = 0 + return sample + + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + prompt_id = tokenizer.num_tokens + raw_parts_a, raw_parts_b = self.get_parts(example) + + raw_parts_a = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_a + ] + + def encode_input(raw_parts): + parts = [] + for x, s in raw_parts: + if isinstance(x, str): + x = tokenizer.EncodeAsIds(x) + elif isinstance(x, int): + x = [prompt_id] * x + else: + pass + parts.append((x, s)) + return parts + + parts_a = encode_input(raw_parts_a) + if self.prefix_prompt > 0: + parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a + parts_b = None + if raw_parts_b: + raw_parts_b = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_b + ] + parts_b = encode_input(raw_parts_b) + answer = self.get_answers(example)[0] + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [tokenizer.get_command('eop').Id] + self.num_truncated += self.truncate( + parts_a, parts_b, answer_ids, max_length=self.max_seq_length) + tokens_a = [token_id for part, _ in parts_a for token_id in part] + tokens_b = [token_id for part, _ in parts_b + for token_id in part] if parts_b else None + data = build_input_from_ids( + tokens_a, + tokens_b, + answer_ids, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) if token == prompt_id + ] + ids = [token if token != prompt_id else 0 for token in ids] + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'prompt_pos': np.array(prompt_pos, dtype=np.int64), + 'label': label, + 'uid': example.guid + } + + def verbalize(self, label) -> List[str]: + return [] + + +class RecordPVP(PVP): + + @property + def is_multi_token(self): + return True + + def get_answers(self, example: InputExample): + choices = 
example.meta['candidates'] + choices = [' ' + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + premise = self.shortenable(example.text_a) + + assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' + question_a, question_b = example.text_b.split('@placeholder') + return [premise, ' ' + question_a.rstrip(), [self.mask], + question_b], [] + + def verbalize(self, label) -> List[str]: + return [] + + +class RacePVP(PVP): + + @property + def is_multi_token(self): + return True + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_answers(self, example: InputExample): + choices = example.meta['choices'] + choices = [' ' + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + context = self.shortenable(example.text_a) + question = ' ' + example.text_b + + if '_' in question: + left, right = question.split('_', maxsplit=1) + if self.pattern_id == 0: + return [context], [ + self.shortenable(left.rstrip()), [self.mask], + self.shortenable(right) + ] + else: + left = left.rstrip() + if left: + left = self.lowercase_first(left) + return [context], [ + ' Based on the previous passage,', + self.shortenable(left), [self.mask], + self.shortenable(right) + ] + else: + if self.pattern_id == 0: + return [context], [ + ' Question:', + self.shortenable(question), ' Answer:', [self.mask] + ] + else: + return [context], [ + ' Based on the previous passage,', + self.shortenable(question), [self.mask] + ] + + def verbalize(self, label) -> List[str]: + return [] + + +class RtePVP(PVP): + VERBALIZER = {'not_entailment': [' No'], 'entailment': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + # switch text_a and text_b to get the correct order + text_a = example.text_a + text_b = example.text_b.rstrip(string.punctuation) + if self.pattern_id == 0: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], ',', None, ' "', + self.shortenable(text_a), '"' + ] # noqa + elif self.pattern_id == 1: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], ',', None, + self.shortenable(' ' + text_a) + ] + elif self.pattern_id == 2: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], '. 
"', None, + self.shortenable(text_a), '"' + ] # noqa + elif self.pattern_id == 3: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], '.', None, + self.shortenable(' ' + text_a) + ] + elif self.pattern_id == 4: + parts_a, parts_b = [ + None, + self.shortenable(text_a), None, ' question:', + self.shortenable(' ' + text_b), ' True or False?', None, + ' answer:', [self.mask] + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [' false'] + return RtePVP.VERBALIZER[label] + + +class CbPVP(RtePVP): + VERBALIZER = { + 'contradiction': [' No'], + 'entailment': [' Yes'], + 'neutral': [' Maybe'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + def get_parts(self, example: InputExample) -> FilledPattern: + if self.pattern_id == 4: + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(' ' + example.text_b) + parts_a, parts_b = [ + None, text_a, None, ' question:', text_b, + ' true, false or neither?', None, ' answer:', [self.mask] + ], [] + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + return super().get_parts(example) + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [ + ' false' + ] if label == 'contradiction' else [' neither'] + return CbPVP.VERBALIZER[label] + + +class BoolQPVP(PVP): + VERBALIZER_A = {'false': [' No'], 'true': [' Yes']} + + VERBALIZER_B = {'false': [' false'], 'true': [' true']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + passage = example.text_a + question = example.text_b + + if self.pattern_id < 2: + parts_a, parts_b = [ + None, + self.shortenable(passage), None, ' Question:', + self.shortenable(' ' + question), '? Answer:', None, + [self.mask], '.' + ], [] + elif self.pattern_id < 4: + parts_a, parts_b = [ + None, + self.shortenable(passage), ' Based on the previous passage,', + None, + self.shortenable(' ' + question), '?', None, [self.mask], '.' 
+ ], [] + elif self.pattern_id < 6: + parts_a, parts_b = [ + 'Based on the following passage', None, + self.shortenable(' ' + question), '?', None, [self.mask], '.', + None, + self.shortenable(' ' + passage) + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0 or self.pattern_id == 2 or self.pattern_id == 4: + return BoolQPVP.VERBALIZER_A[label] + else: + return BoolQPVP.VERBALIZER_B[label] + + +class MultiRcPVP(PVP): + VERBALIZER = {0: [' No'], 1: [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + passage = self.remove_final_punc( + self.shortenable(example.text_a.rstrip())) + question = self.remove_final_punc(example.text_b.rstrip()) + answer = example.meta['answer'] + if self.pattern_id == 0: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question + '?', None, + ' Is it', ' ' + answer, '?', None, [self.mask], '.' + ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question, '?', + None, ' Is the correct answer "', answer, '"?', None, + [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + passage, '. Based on the previous passage,', None, + ' ' + question, '?', None, ' Is "', answer, + '" a correct answer?', None, [self.mask], '.' + ], [] + elif self.pattern_id == 3: + parts_a, parts_b = [ + None, passage, None, ' ' + question, '- [', [self.mask], ']', + None, answer + ], [] + elif self.pattern_id == 4: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question, '?', None, + ' ' + answer, '?', None, [self.mask], '.' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 3: + return [' False'] if label == 0 else [' True'] + return MultiRcPVP.VERBALIZER[label] + + +class WicPVP(PVP): + VERBALIZER_A = {'false': [' No'], 'true': [' Yes']} + VERBALIZER_B = {'false': ['2'], 'true': ['b']} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = example.text_a + text_b = example.text_b + word = example.meta['word'] + + if self.pattern_id == 0: + parts_a, parts_b = [ + None, + self.shortenable('"' + text_a + '" / "' + text_b + '"'), None, + ' Similar sense of "' + word + '"?', None, [self.mask], '.' 
+ ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + self.shortenable(text_a), None, + self.shortenable(' ' + text_b), None, + ' Does ' + word + ' have the same meaning in both sentences?', + None, [self.mask] + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, word, ' .', None, ' Sense (1) (a) "', + self.shortenable(text_a), '"', None, ' (', [self.mask], ') "', + text_b, '"' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 2: + return WicPVP.VERBALIZER_B[label] + return WicPVP.VERBALIZER_A[label] + + +class AgnewsPVP(PVP): + VERBALIZER = { + '1': [' World'], + '2': [' Sports'], + '3': [' Business'], + '4': [' Tech'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0: + return [[self.mask], ':', text_a, text_b], [] + elif self.pattern_id == 1: + return [[self.mask], ' News:', text_a, text_b], [] + elif self.pattern_id == 2: + return [text_a, '(', [self.mask], ')', text_b], [] + elif self.pattern_id == 3: + return [text_a, text_b, '(', [self.mask], ')'], [] + elif self.pattern_id == 4: + return ['[ Category:', [self.mask], ']', text_a, text_b], [] + elif self.pattern_id == 5: + return [[self.mask], '-', text_a, text_b], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return AgnewsPVP.VERBALIZER[label] + + +class YahooPVP(PVP): + VERBALIZER = { + '1': [' Society'], + '2': [' Science'], + '3': [' Health'], + '4': [' Education'], + '5': [' Computer'], + '6': [' Sports'], + '7': [' Business'], + '8': [' Entertainment'], + '9': [' Relationship'], + '10': [' Politics'], + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0: + return [[self.mask], ':', text_a, text_b], [] + elif self.pattern_id == 1: + return [[self.mask], ' Question:', text_a, text_b], [] + elif self.pattern_id == 2: + return [text_a, '(', [self.mask], ')', text_b], [] + elif self.pattern_id == 3: + return [text_a, text_b, '(', [self.mask], ')'], [] + elif self.pattern_id == 4: + return ['[ Category:', [self.mask], ']', text_a, text_b], [] + elif self.pattern_id == 5: + return [[self.mask], '-', text_a, text_b], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return YahooPVP.VERBALIZER[label] + + +class MnliPVP(PVP): + VERBALIZER_A = { + 'contradiction': [' Wrong'], + 'entailment': [' Right'], + 'neutral': [' Maybe'] + } + VERBALIZER_B = { + 'contradiction': [' No'], + 'entailment': [' Yes'], + 'neutral': [' Maybe'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(self.remove_final_punc(example.text_a)) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0 or self.pattern_id == 2: + return ['"', text_a, '" ?'], [[self.mask], ', "', text_b, '"'] + elif self.pattern_id == 1 or self.pattern_id == 3: + return [text_a, 
'?'], [[self.mask], ',', text_b] + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0 or self.pattern_id == 1: + return MnliPVP.VERBALIZER_A[label] + return MnliPVP.VERBALIZER_B[label] + + +class YelpPolarityPVP(PVP): + VERBALIZER = {'1': [' bad'], '2': [' good']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3] + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + + if self.pattern_id == 0: + return ['It was', [self.mask], '.', text], [] + elif self.pattern_id == 1: + return [text, '. All in all, it was', [self.mask], '.'], [] + elif self.pattern_id == 2: + return ['Just', [self.mask], '!'], [text] + elif self.pattern_id == 3: + return [text], [' In summary, the restaurant is', [self.mask], '.'] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return YelpPolarityPVP.VERBALIZER[label] + + +class YelpFullPVP(YelpPolarityPVP): + VERBALIZER = { + '1': [' terrible'], + '2': [' bad'], + '3': [' okay'], + '4': [' good'], + '5': [' great'] + } + + def verbalize(self, label) -> List[str]: + return YelpFullPVP.VERBALIZER[label] + + +class XStancePVP(PVP): + VERBALIZERS = { + 'en': { + 'FAVOR': ['Yes'], + 'AGAINST': ['No'] + }, + 'de': { + 'FAVOR': ['Ja'], + 'AGAINST': ['Nein'] + }, + 'fr': { + 'FAVOR': ['Oui'], + 'AGAINST': ['Non'] + } + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0 or self.pattern_id == 2 or self.pattern_id == 4: + return ['"', text_a, '"'], [[self.mask], '. 
"', text_b, '"'] + elif self.pattern_id == 1 or self.pattern_id == 3 or self.pattern_id == 5: + return [text_a], [[self.mask], '.', text_b] + + def verbalize(self, label) -> List[str]: + lang = 'de' if self.pattern_id < 2 else 'en' if self.pattern_id < 4 else 'fr' + return XStancePVP.VERBALIZERS[lang][label] + + +class Sst2PVP(PVP): + VERBALIZER_A = {'0': [' terrible'], '1': [' great']} + + VERBALIZER_B = {'0': [' bad'], '1': [' good']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + if self.pattern_id == 0 or self.pattern_id == 1: + return [text, ' It was', [self.mask], '.'], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0: + return Sst2PVP.VERBALIZER_A[label] + else: + return Sst2PVP.VERBALIZER_B[label] + + +class ColaPVP(PVP): + VERBALIZER = {'0': [' incorrect'], '1': [' correct']} + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + if self.pattern_id == 0: + return ['"', text, '"', ' This is', [self.mask], '.'], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return ColaPVP.VERBALIZER[label] + + +class MrpcPVP(PVP): + VERBALIZER = {'0': [' No'], '1': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(example.text_a) + if self.pattern_id == 0: + text_b = self.shortenable(self.lowercase_first(example.text_b)) + return [text_a], [[self.mask], ', ', text_b] + elif self.pattern_id == 1: + text_b = self.shortenable( + self.remove_final_punc(self.lowercase_first(example.text_b))) + return [text_a], [' Does it mean that', text_b, '?', [self.mask]] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return MrpcPVP.VERBALIZER[label] + + +class QqpPVP(PVP): + VERBALIZER = {'0': [' No'], '1': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(self.lowercase_first(example.text_b)) + if self.pattern_id == 0: + return [text_a], [' Do you mean ', text_b, [self.mask], '.'] + elif self.pattern_id == 1: + return [text_a], [[self.mask], ', ', text_b] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return QqpPVP.VERBALIZER[label] + + +class QnliPVP(PVP): + VERBALIZER = {'not_entailment': [' No'], 'entailment': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + def get_parts(self, example: InputExample) -> FilledPattern: + question = self.remove_final_punc(example.text_a) + passage = example.text_b + if self.pattern_id == 0: + return [ + self.shortenable(passage), ' Question:', + self.shortenable(' ' + question), '? Do you know the answer?', + [self.mask], '.' + ], [] + elif self.pattern_id == 1: + return [ + self.shortenable(passage), + ' Based on the previous passage, do you know the answer', + self.shortenable(' ' + question), '?', [self.mask], '.' 
+ ], [] + elif self.pattern_id == 2: + return [ + 'Based on the following passage, do you know the answer', + self.shortenable(' ' + question), '?', [self.mask], '.', + self.shortenable(' ' + passage) + ], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return QnliPVP.VERBALIZER[label] + + +class SquadPVP(PVP): + + @property + def is_multi_token(self): + return True + + def get_answers(self, example: InputExample): + target = ' ' + example.meta['answer']['text'] + answers = [target] + return answers + + def get_parts(self, example: InputExample) -> FilledPattern: + context = self.shortenable(example.text_a) + question = example.text_b + return [context, ' ' + question, [self.mask], '.'], [] + + def verbalize(self, label) -> List[str]: + return [] + + +def get_verbalization_ids(word: str, tokenizer, + force_single_token: bool) -> Union[int, List[int]]: + """ + Get the token ids corresponding to a verbalization + + :param word: the verbalization + :param tokenizer: the tokenizer to use + :param force_single_token: whether it should be enforced that the verbalization corresponds to a single token. + If set to true, this method returns a single int instead of a list and throws an error if the word + corresponds to multiple tokens. + :return: either the list of token ids or the single token id corresponding to this word + """ + ids = tokenizer.EncodeAsIds(word).tokenization + if not force_single_token: + return ids + assert len(ids) == 1, \ + f'Verbalization "{word}" does not correspond to a single token, got {tokenizer.DecodeIds(ids)}' + verbalization_id = ids[0] + assert verbalization_id not in tokenizer.command_id_map, \ + f'Verbalization {word} is mapped to a special token {tokenizer.IdToToken(verbalization_id)}' + return verbalization_id + + +PVPS = { + 'agnews': AgnewsPVP, + 'mnli': MnliPVP, + 'yelp-polarity': YelpPolarityPVP, + 'yelp-full': YelpFullPVP, + 'yahoo': YahooPVP, + 'xstance': XStancePVP, + 'xstance-de': XStancePVP, + 'xstance-fr': XStancePVP, + 'rte': RtePVP, + 'wic': WicPVP, + 'cb': CbPVP, + 'wsc': WscPVP, + 'boolq': BoolQPVP, + 'copa': CopaPVP, + 'multirc': MultiRcPVP, + 'record': RecordPVP, + 'ax-b': RtePVP, + 'ax-g': RtePVP, + 'sst2': Sst2PVP, + 'cola': ColaPVP, + 'mrpc': MrpcPVP, + 'qqp': QqpPVP, + 'qnli': QnliPVP, + 'squad': SquadPVP, + 'race': RacePVP, +} diff --git a/modelscope/models/nlp/mglm/test/__init__.py b/modelscope/models/nlp/mglm/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/test/test_block.py b/modelscope/models/nlp/mglm/test/test_block.py new file mode 100644 index 00000000..ed4225da --- /dev/null +++ b/modelscope/models/nlp/mglm/test/test_block.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 Zhipu.AI + +import random +from argparse import Namespace + +import numpy as np +from blocklm_utils import ConstructBlockStrategy + + +# rng = random.Random() +# span_lengths = [2, 3, 4, 2, 3, 4] +# length = 100 +# +# counts = np.array([0] * length) +# for _ in range(10000): +# rng.shuffle(span_lengths) +# spans = ConstructBlockStrategy.sample_spans(span_lengths, length, rng) +# for start, end in spans: +# counts[start: end] += 1 +# print(counts) +def main(): + args = Namespace() + args.seq_length = 10 + args.eod_token = 0 + + strategy = ConstructBlockStrategy( + args, None, bert_ratio=0.4, max_seq_length=128) + counts = np.array([0] * 10) + for _ in range(10000): + spans = strategy.sample_span_in_document( + np.array([1, 2, 3, 
0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1], + random.Random()) + for start, end in spans: + counts[start:end] += 1 + + print(counts) diff --git a/modelscope/models/nlp/mglm/test/test_rel_shift.py b/modelscope/models/nlp/mglm/test/test_rel_shift.py new file mode 100644 index 00000000..00cbb9fe --- /dev/null +++ b/modelscope/models/nlp/mglm/test/test_rel_shift.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 Zhipu.AI + +import matplotlib.pyplot as plt +import numpy as np +from learning_rates import AnnealingLR +from torch.nn.modules import Linear +from torch.optim import Adam + + +def main(): + model = Linear(10, 10) + optimizer = Adam(model.parameters()) + lr_scheduler = AnnealingLR( + optimizer, + start_lr=0.00015, + warmup_iter=3000, + num_iters=300000, + decay_style='cosine', + decay_ratio=0.1) + steps = np.arange(0, 400000, 10, dtype=np.long) + rates = [] + for step in steps: + lr_scheduler.num_iters = step + rates.append(lr_scheduler.get_lr()) + print(rates) + plt.plot(steps, rates) + plt.savefig('lr.pdf', format='pdf') diff --git a/modelscope/models/nlp/mglm/train_utils.py b/modelscope/models/nlp/mglm/train_utils.py new file mode 100644 index 00000000..c9c0de8e --- /dev/null +++ b/modelscope/models/nlp/mglm/train_utils.py @@ -0,0 +1,472 @@ +# Copyright (c) 2022 Zhipu.AI + +import deepspeed +import torch +from apex.optimizers import FusedAdam as Adam +from torch import distributed as dist + +from . import mpu +from .fp16 import DynamicLossScaler, FP16_Module, FP16_Optimizer +from .model import DistributedDataParallel as LocalDDP +from .model import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, + GLMForSequenceClassification, GLMForSingleTokenCloze, + GLMModel) +from .model import PyTorchDistributedDataParallel as TorchDDP +from .model import glm_get_params_for_weight_decay_optimization +from .utils import get_checkpoint_iteration, get_checkpoint_name, print_rank_0 + + +def load_pretrained(model, checkpoint_path, args, task_tokens=None): + load_dir, tag, release, success = get_checkpoint_iteration(checkpoint_path) + checkpoint_name = get_checkpoint_name(load_dir, tag, release) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading pretrained model {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + # Load the checkpoint. + sd = torch.load(checkpoint_name, map_location='cpu') + if args.deepspeed: + model = model.module + if isinstance(model, TorchDDP): + model = model.module + if isinstance(model, FP16_Module): + model = model.module + if hasattr(model, 'model'): + model = model.model + + # Model. 
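+    # The helper below widens a pretrained (block) position embedding table
+    # when args.max_position_embeddings exceeds the checkpoint's table size:
+    # the pretrained rows are copied in and the extra rows keep the current
+    # model's freshly initialized values.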
+ def extend_embedding_weights(state_weights, model_weights): + original_length = state_weights.shape[0] + assert original_length <= args.max_position_embeddings + 1 + new_weights = model_weights.clone() + new_weights[:original_length] = state_weights + return new_weights + + if args.block_lm: + if 'transformer.block_position_embeddings.weight' in sd['module']: + position_weights = sd['module'][ + 'transformer.position_embeddings.weight'] + if args.max_position_embeddings + 1 > position_weights.shape[0]: + sd['module'][ + 'transformer.position_embeddings.weight'] = extend_embedding_weights( + position_weights, + model.state_dict() + ['transformer.position_embeddings.weight'].data) + print_rank_0( + f'Extend position embedding to {args.max_position_embeddings + 1}' + ) + if 'transformer.block_position_embeddings.weight' in sd['module']: + block_position_weights = sd['module'][ + 'transformer.block_position_embeddings.weight'] + if args.max_position_embeddings + 1 > block_position_weights.shape[ + 0]: + sd['module'][ + 'transformer.block_position_embeddings.weight'] = extend_embedding_weights( + block_position_weights, + model.state_dict() + ['transformer.block_position_embeddings.weight'].data) + print_rank_0( + f'Extend block position embedding to {args.max_position_embeddings + 1}' + ) + for key in list(model.state_dict().keys()): + print(key) + model.state_dict()[key.replace( + 'mixins.block_position_embedding.block_position_embeddings.weight', + 'transformer.block_position_embeddings.weight').replace( + 'transformer.word_embeddings.weight', + 'word_embeddings.weight')] = model.state_dict().pop(key) + + missing_keys, unexpected_keys = model.load_state_dict( + sd['module'], strict=False) + if missing_keys or unexpected_keys: + print_rank_0( + f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}') + if args.continuous_prompt and args.prompt_init: + model.prompt_spell.init_embedding(model.word_embeddings.weight.data, + task_tokens) + + +def get_model(args, + model_type=None, + multi_token=True, + num_labels=None, + spell_length=None): + """Build the model.""" + print_rank_0('building GPT2 model ...') + if args.pretrained_bert: + if model_type == 'multiple_choice': + model = BertForMultipleChoice.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon) + elif model_type == 'classification': + model = BertForSequenceClassification.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon, + num_labels=num_labels) + else: + raise NotImplementedError + else: + output_predict, paralle_output = True, True + if (model_type == 'multiple_choice' + or model_type == 'classification') and not args.cloze_eval: + output_predict = False + if model_type is not None: + paralle_output = False + if spell_length is not None: + print_rank_0(f'Continuous spell length {spell_length}') + model = GLMModel( + num_layers=args.num_layers, + vocab_size=args.vocab_size, + hidden_size=args.hidden_size, + num_attention_heads=args.num_attention_heads, + embedding_dropout_prob=args.hidden_dropout, + attention_dropout_prob=args.attention_dropout, + output_dropout_prob=args.hidden_dropout, + max_sequence_length=args.max_position_embeddings, + max_memory_length=args.mem_length, + checkpoint_activations=args.checkpoint_activations, + 
checkpoint_num_layers=args.checkpoint_num_layers, + parallel_output=paralle_output, + relative_encoding=args.transformer_xl, + block_position_encoding=args.block_lm and not args.masked_lm, + output_predict=output_predict, + spell_length=spell_length, + spell_func=args.prompt_func, + attention_scale=args.attention_scale) + if args.freeze_transformer: + model.freeze_transformer( + tune_prefix_layers=args.tune_prefix_layers) + if model_type is not None: + if model_type == 'multiple_choice': + if args.cloze_eval: + if multi_token: + if args.fast_decode: + model = GLMForMultiTokenClozeFast( + model, length_penalty=args.length_penalty) + else: + model = GLMForMultiTokenCloze( + model, length_penalty=args.length_penalty) + else: + model = GLMForSingleTokenCloze( + model, take_softmax=args.adapet) + else: + model = GLMForSequenceClassification( + model, + args.hidden_size, + args.output_dropout, + args.pool_token, + num_class=num_labels) + elif model_type == 'classification': + model = GLMForSequenceClassification( + model, + args.hidden_size, + args.output_dropout, + args.pool_token, + num_class=num_labels) + elif model_type == 'generation': + pass + else: + raise NotImplementedError(model_type) + + if mpu.get_data_parallel_rank() == 0: + print( + ' > number of parameters on model parallel rank {}: {}'.format( + mpu.get_model_parallel_rank(), + sum([p.nelement() for p in model.parameters()])), + flush=True) + + # To prevent OOM for model sizes that cannot fit in GPU memory in full precision + if args.fp16: + model.half() + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if args.fp16: + model = FP16_Module(model) + + # Wrap model for distributed training. + if not args.deepspeed and (args.train_iters or args.epochs): + if args.DDP_impl == 'torch': + i = torch.cuda.current_device() + model = TorchDDP( + model, + device_ids=[i], + output_device=i, + process_group=mpu.get_data_parallel_group()) + elif args.DDP_impl == 'local': + model = LocalDDP(model) + else: + print_rank_0('Skip DDP model') + return model + + +def get_optimizer_param_groups(model): + # Build parameter groups (weight decay and non-decay). + while isinstance(model, (LocalDDP, TorchDDP, FP16_Module)): + model = model.module + param_groups = glm_get_params_for_weight_decay_optimization(model) + + # Add model parallel attribute if it is not set. + for param_group in param_groups: + # print('## param_group', len(param_group['params'])) + for param in param_group['params']: + if not hasattr(param, 'model_parallel'): + param.model_parallel = False + + return param_groups + + +def get_optimizer(param_groups, args): + """Set up the optimizer.""" + if args.cpu_optimizer: + # Apex FusedAdam uses decoupled weight decay so use the same here + if args.cpu_torch_adam: + cpu_adam_optimizer = torch.optim.AdamW + else: + from deepspeed.ops.adam import DeepSpeedCPUAdam + cpu_adam_optimizer = DeepSpeedCPUAdam + optimizer = cpu_adam_optimizer( + param_groups, lr=args.lr, weight_decay=args.weight_decay) + else: + # Use FusedAdam. 
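+        # 'adam' uses Apex FusedAdam (imported above as Adam); 'adafactor'
+        # falls back to the HuggingFace transformers implementation. When
+        # args.fp16 is set, the optimizer is wrapped by FP16_Optimizer below.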
+ if args.optimizer == 'adam': + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps) + elif args.optimizer == 'adafactor': + from transformers import Adafactor + optimizer = Adafactor( + param_groups, + lr=args.lr, + relative_step=False, + warmup_init=False) + else: + raise NotImplementedError + + print(f'Optimizer = {optimizer.__class__.__name__}') + if hasattr(args, 'deepspeed') and args.deepspeed: + raise NotImplementedError + # fp16 wrapper is not required for DeepSpeed. + # return optimizer + + # Wrap into fp16 optimizer. + if args.fp16: + optimizer = FP16_Optimizer( + optimizer, + static_loss_scale=args.loss_scale, + dynamic_loss_scale=args.dynamic_loss_scale, + dynamic_loss_args={ + 'scale_window': args.loss_scale_window, + 'min_scale': args.min_scale, + 'delayed_shift': args.hysteresis + }) + + return optimizer + + +def get_learning_rate_scheduler(optimizer, args): + """Build the learning rate scheduler.""" + + # Add linear learning rate scheduler. + if args.lr_decay_iters is not None: + num_iters = args.lr_decay_iters + else: + num_iters = args.train_iters + if args.finetune: + num_iters = num_iters // args.gradient_accumulation_steps + num_iters = max(1, num_iters) + init_step = -1 + warmup_iter = args.warmup * num_iters + lr_scheduler = AnnealingLR( + optimizer, + start_lr=args.lr, + warmup_iter=warmup_iter, + num_iters=num_iters - warmup_iter, + decay_style=args.lr_decay_style, + last_iter=init_step, + decay_ratio=args.lr_decay_ratio) + + return lr_scheduler + + +def setup_model_and_optimizer(args, + model_type=None, + multi_token=True, + num_labels=None, + spell_length=None): + """Setup model and optimizer.""" + + model = get_model( + args, + model_type=model_type, + multi_token=multi_token, + num_labels=num_labels, + spell_length=spell_length) + param_groups = get_optimizer_param_groups(model) + + if args.train_data is not None or args.data_dir is not None and ( + args.epochs > 0 or args.train_iters > 0): + if args.deepspeed: + print_rank_0('DeepSpeed is enabled.') + + model, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=param_groups, + args=args, + mpu=mpu, + dist_init_required=False) + else: + optimizer = get_optimizer(param_groups, args) + lr_scheduler = get_learning_rate_scheduler(optimizer, args) + else: + optimizer, lr_scheduler = None, None + + return model, optimizer, lr_scheduler + + +def backward_step(optimizer, model, lm_loss, args, timers): + """Backward step.""" + + # Total loss. + loss = lm_loss + + # Backward pass. + if args.deepspeed: + model.backward(loss) + else: + # optimizer.zero_grad() + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.deepspeed or args.DDP_impl == 'torch': + # DeepSpeed backward propagation already addressed all reduce communication. + # Reset the timer to avoid breaking timer logs below. + timers('allreduce').reset() + else: + timers('allreduce').start() + model.allreduce_params( + reduce_after=False, fp32_allreduce=args.fp32_allreduce) + timers('allreduce').stop() + + # Update master gradients. + if not args.deepspeed: + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. 
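+        # With fp16, clipping is applied to the master (fp32) gradients kept
+        # by the optimizer; otherwise the model gradients are clipped directly
+        # via the model-parallel aware mpu.clip_grad_norm.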
+ if args.clip_grad > 0: + if not args.fp16: + mpu.clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss + + +def see_memory_usage(message, force=False): + if not force: + return + dist.barrier() + if dist.get_rank() == 0: + print(message) + print('Memory Allocated ', + torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Max Memory Allocated ', + torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Cache Allocated ', + torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') + print('Max cache Allocated ', + torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + 'GigaBytes') + print(' ') + # input("Press Any Key To Continue ..") + + +def train_step(data_iterator, + model, + optimizer, + lr_scheduler, + args, + timers, + forward_step_func, + mems=None, + single_step=False): + """Single training step.""" + lm_loss_total, count = 0.0, 0 + mems = [] if mems is None else mems + if not args.deepspeed: + optimizer.zero_grad() + while True: + skipped_iter, complete = 0, False + # Forward model for one step. + timers('forward').start() + lm_loss, mems, _ = forward_step_func(data_iterator, model, args, + timers, mems) + timers('forward').stop() + # print_rank_0("Forward step") + if not args.deepspeed: + lm_loss /= args.gradient_accumulation_steps + + reduced_loss = lm_loss.detach().clone().view(1) + torch.distributed.all_reduce( + reduced_loss.data, group=mpu.get_data_parallel_group()) + reduced_loss.data = reduced_loss.data / ( + args.world_size / args.model_parallel_size) + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + lm_loss_total += reduced_loss + count += 1 + + # Calculate gradients, reduce across processes, and clip. + timers('backward').start() + backward_step(optimizer, model, lm_loss, args, timers) + timers('backward').stop() + # print_rank_0("Backward step") + # Update parameters. + timers('optimizer').start() + if args.deepspeed: + if model.is_gradient_accumulation_boundary(): + model.step() + complete = True + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + else: + skipped_iter = 1 + else: + model.step() + else: + if count == args.gradient_accumulation_steps: + optimizer.step() + complete = True + # Update learning rate. + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + else: + skipped_iter = 1 + # print_rank_0("Optimizer step") + timers('optimizer').stop() + if complete: + break + else: + print_rank_0('Found NaN loss, skip backward') + del lm_loss, reduced_loss + mems = [] + if single_step: + break + if args.deepspeed: + lm_loss_total = lm_loss_total / count + return lm_loss_total, skipped_iter, mems diff --git a/modelscope/models/nlp/mglm/utils.py b/modelscope/models/nlp/mglm/utils.py new file mode 100644 index 00000000..2bfcf8c0 --- /dev/null +++ b/modelscope/models/nlp/mglm/utils.py @@ -0,0 +1,529 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for logging and serialization""" + +import os +import random +import subprocess +import time + +import json +import numpy as np +import torch + +from . import mpu +from .fp16 import FP16_Optimizer + +SUMMARY_WRITER_DIR_NAME = 'runs' + + +def get_log_dir(name, base): + return os.path.join(base, SUMMARY_WRITER_DIR_NAME, name) + + +def print_rank_0(message): + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + + +def get_hostname(): + hostname_cmd = ['hostname -I'] + result = subprocess.check_output(hostname_cmd, shell=True) + master_addr = result.decode('utf-8').split()[0] + return master_addr + + +def get_spare_port(args): + if torch.distributed.get_rank() == 0: + port = subprocess.check_output(['shuf -n 1 -i 10000-65535'], + shell=True) + port = int(port.strip()) + if port == args.master_port: + port = subprocess.check_output(['shuf -n 1 -i 10000-65535'], + shell=True) + port = int(port.strip()) + port = torch.cuda.LongTensor([port]) + else: + port = torch.cuda.LongTensor([0]) + torch.distributed.broadcast(port, 0) + port = port.item() + return port + + +def print_and_save_args(args, verbose=True, log_dir=None): + """Print arguments.""" + if verbose: + print('arguments:', flush=True) + for arg in vars(args): + dots = '.' * (29 - len(arg)) + print( + ' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) + if log_dir is not None: + json_file = os.path.join(log_dir, 'config.json') + with open(json_file, 'w') as output: + json.dump(vars(args), output, sort_keys=True) + if args.deepspeed and args.deepspeed_config is not None: + with open(args.deepspeed_config) as file: + deepspeed_config = json.load(file) + deepspeed_json_file = os.path.join(log_dir, + 'config_gpt_large.json') + with open(deepspeed_json_file, 'w') as output: + json.dump(deepspeed_config, output) + + +def print_params_min_max_norm(optimizer, iteration): + """Print min, max, and norm of all parameters.""" + index = 0 + rank = torch.distributed.get_rank() + string = 'iteration, rank, index, model-parallel,min, max, norm\n' + optimizer_ = optimizer + if isinstance(optimizer, FP16_Optimizer): + optimizer_ = optimizer.optimizer + for param_group in optimizer_.param_groups: + for param in param_group['params']: + index += 1 + min_ = param.data.min() + max_ = param.data.max() + norm = param.data.norm() + string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( + iteration, rank, index, int(param.model_parallel)) + string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) + print(string, flush=True) + + +class Timers: + """Group of timers.""" + + class Timer: + """Timer.""" + + def __init__(self, name): + self.name_ = name + self.elapsed_ = 0.0 + self.started_ = False + self.start_time = time.time() + + def start(self): + """Start the timer.""" + assert not self.started_, 'timer has already been started' + torch.cuda.synchronize() + self.start_time = time.time() + self.started_ = True + + def stop(self): + """Stop the timer.""" + assert self.started_, 'timer is not started' + torch.cuda.synchronize() + self.elapsed_ += (time.time() - self.start_time) + self.started_ = False + + def reset(self): + """Reset timer.""" + self.elapsed_ = 0.0 + self.started_ = False + + def elapsed(self, reset=True): + """Calculate the elapsed time.""" + started_ = self.started_ + # If the timing in progress, end it first. 
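+            # stop() folds the running interval into elapsed_ so the value
+            # read below includes it; the timer is restarted at the end if it
+            # was running.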
+ if self.started_: + self.stop() + # Get the elapsed time. + elapsed_ = self.elapsed_ + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if started_: + self.start() + return elapsed_ + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = self.Timer(name) + return self.timers[name] + + def log(self, names, normalizer=1.0, reset=True): + """Log a group of timers.""" + assert normalizer > 0.0 + string = 'time (ms)' + for name in names: + elapsed_time = self.timers[name].elapsed( + reset=reset) * 1000.0 / normalizer + string += ' | {}: {:.2f}'.format(name, elapsed_time) + print_rank_0(string) + + +def report_memory(name): + """Simple GPU memory report.""" + + mega_bytes = 1024.0 * 1024.0 + string = name + ' memory (MB)' + string += ' | allocated: {}'.format(torch.cuda.memory_allocated() + / mega_bytes) + string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated() + / mega_bytes) + string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes) + string += ' | max cached: {}'.format(torch.cuda.memory_reserved() + / mega_bytes) + print_rank_0(string) + + +def get_checkpoint_name(checkpoints_path, + iteration, + release=False, + zero=False): + if release: + d = 'release' + else: + d = '{}'.format(iteration) + if zero: + dp_rank = mpu.get_data_parallel_rank() + d += '_zero_dp_rank_{}'.format(dp_rank) + return os.path.join( + checkpoints_path, d, + 'mp_rank_{:02d}_model_states.pt'.format(mpu.get_model_parallel_rank())) + + +def ensure_directory_exists(filename): + dirname = os.path.dirname(filename) + if not os.path.exists(dirname): + os.makedirs(dirname, exist_ok=True) + + +def get_checkpoint_tracker_filename(checkpoints_path): + return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') + + +def save_zero_checkpoint(args, iteration, optimizer): + zero_sd = { + 'iteration': iteration, + 'optimizer_state_dict': optimizer.state_dict() + } + zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True) + ensure_directory_exists(zero_checkpoint_name) + torch.save(zero_sd, zero_checkpoint_name) + print(' successfully saved {}'.format(zero_checkpoint_name)) + + +def save_checkpoint(iteration, + model, + optimizer, + lr_scheduler, + args, + tag=None, + barrier=True, + only_changed_parameters=False, + no_deepspeed=False, + no_save_optim=False): + """Save a model checkpoint.""" + if tag is None: + tag = str(iteration) + if args.deepspeed and not no_deepspeed: + save_ds_checkpoint(iteration, model, lr_scheduler, args, tag=tag) + else: + # Only rank zer0 of the data parallel writes to the disk. + + if mpu.get_data_parallel_rank() == 0: + checkpoint_name = get_checkpoint_name(args.save, tag) + print( + 'global rank {} is saving checkpoint at iteration {:7d} to {}'. + format(torch.distributed.get_rank(), iteration, + checkpoint_name)) + sd = {'iteration': iteration} + if args.deepspeed: + model = model.module + state_dict = model.state_dict() + if only_changed_parameters: + requires_grad_dict = {} + for name, parameter in model.named_parameters(): + requires_grad_dict[name] = parameter.requires_grad + state_dict = { + key: value + for key, value in state_dict.items() + if requires_grad_dict[key] + } + sd['module'] = state_dict + + # Optimizer stuff. 
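+            # Optimizer and LR-scheduler states are saved as well, unless
+            # disabled via args.no_save_optim or the no_save_optim argument.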
+ if not args.no_save_optim and not no_save_optim: + if optimizer is not None: + sd['optimizer'] = optimizer.state_dict() + if lr_scheduler is not None: + sd['lr_scheduler'] = lr_scheduler.state_dict() + + # rng states. + if not args.no_save_rng: + sd['random_rng_state'] = random.getstate() + sd['np_rng_state'] = np.random.get_state() + sd['torch_rng_state'] = torch.get_rng_state() + sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker( + ).get_states() + + ensure_directory_exists(checkpoint_name) + torch.save(sd, checkpoint_name) + print(' successfully saved {}'.format(checkpoint_name)) + + # Wait so everyone is done (necessary) + if barrier: + torch.distributed.barrier() + # And update the latest iteration + if torch.distributed.get_rank() == 0: + tracker_filename = get_checkpoint_tracker_filename(args.save) + with open(tracker_filename, 'w') as f: + f.write(tag) + + +def save_ds_checkpoint(iteration, model, lr_scheduler, args, tag): + """Save a model checkpoint.""" + + sd = {} + sd['iteration'] = iteration + if lr_scheduler is not None: + sd['client_lr_scheduler'] = lr_scheduler.state_dict() + # rng states. + if not args.no_save_rng: + sd['random_rng_state'] = random.getstate() + sd['np_rng_state'] = np.random.get_state() + sd['torch_rng_state'] = torch.get_rng_state() + sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() + model.save_checkpoint(args.save, tag, client_state=sd) + + +def get_checkpoint_iteration(load_path): + # Read the tracker file and set the iteration. + tracker_filename = get_checkpoint_tracker_filename(load_path) + if not os.path.isfile(tracker_filename): + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + if os.path.isdir(load_path): + path = os.path.normpath(load_path) + load_dir, tag = os.path.split(path) + print_rank_0( + 'Try to directly load the checkpoint from the directory') + return load_dir, tag, False, True + print_rank_0(' will not load any checkpoints and will start from ' + 'random') + return load_path, 0, False, False + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + release = metastring == 'release' + # try: + # iteration = int(metastring) + # except ValueError: + # release = metastring == 'release' + # if not release: + # print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( + # tracker_filename)) + # exit() + + # assert iteration > 0 or release, 'error parsing metadata file {}'.format( + # tracker_filename) + + return load_path, metastring, release, True + + +def load_checkpoint(model, + optimizer, + lr_scheduler, + args, + no_deepspeed=False, + no_load_optim=False): + """Load a model checkpoint.""" + + load_dir, tag, release, success = get_checkpoint_iteration(args.load) + + if not success: + return 0 + + if args.deepspeed and not no_deepspeed: + + checkpoint_name, sd = model.load_checkpoint( + load_dir, + tag, + load_optimizer_states=not args.no_load_optim and not no_load_optim, + load_lr_scheduler_states=not args.no_load_lr_scheduler) + if not args.no_load_lr_scheduler and 'client_lr_scheduler' in sd: + lr_scheduler.load_state_dict(sd['client_lr_scheduler']) + print_rank_0('Load lr scheduler state') + if checkpoint_name is None: + if mpu.get_data_parallel_rank() == 0: + print('Unable to load checkpoint.') + return tag + + else: + + # Checkpoint. 
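+        # Loading without the DeepSpeed checkpoint engine: read the raw torch
+        # checkpoint on CPU, then restore module weights, optimizer and
+        # lr-scheduler state, and RNG states below.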
+ checkpoint_name = get_checkpoint_name(load_dir, tag, release) + + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + # Load the checkpoint. + sd = torch.load(checkpoint_name, map_location='cpu') + + # Model. + if args.deepspeed: + model = model.module + missing_keys, unexpected_keys = model.load_state_dict( + sd['module'], strict=False) + if missing_keys or unexpected_keys: + print_rank_0( + f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}' + ) + + # Optimizer. + if not release and not args.finetune and not args.no_load_optim and not no_load_optim: + try: + if optimizer is not None: + optimizer.load_state_dict(sd['optimizer']) + if lr_scheduler is not None: + lr_scheduler.load_state_dict(sd['lr_scheduler']) + except KeyError: + print_rank_0( + 'Unable to load optimizer from checkpoint {}, exiting. ' + 'Specify --no-load-optim or --finetune to prevent ' + 'attempting to load the optimizer ' + 'state.'.format(checkpoint_name)) + + # Iterations. + if args.finetune or release: + iteration = 0 + else: + try: + iteration = sd['iteration'] + except KeyError: + try: # Backward compatible with older checkpoints + iteration = sd['total_iters'] + except KeyError: + print_rank_0( + 'A metadata file exists but Unable to load iteration ' + ' from checkpoint {}, starting from 0 iteration'.format( + checkpoint_name)) + iteration = 0 + + # rng states. + if not release and not args.finetune and not args.no_load_rng: + try: + random.setstate(sd['random_rng_state']) + np.random.set_state(sd['np_rng_state']) + torch.set_rng_state(sd['torch_rng_state']) + torch.cuda.set_rng_state(sd['cuda_rng_state']) + mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states']) + except KeyError: + print_rank_0( + 'Unable to load random state from checkpoint {}, exiting. ' + 'Specify --no-load-rng or --finetune to prevent ' + 'attempting to load the random ' + 'state.'.format(checkpoint_name)) + + if mpu.get_data_parallel_rank() == 0: + print(' successfully loaded {}'.format(checkpoint_name)) + + return iteration + + +def load_weights(src, dst, dst2src=False): + """ + Loads weights from src to dst via in place copy. + src is a huggingface gpt2model, while dst is one of our models. + dst2src=True loads parameters from our models into huggingface's. + ^dst2src is still untested + """ + conv_layer = 'Conv1D' in str(type(src)) + for n, p in src.named_parameters(): + if dst2src: + data = dst._parameters[n].data + load = p.data + else: + data = p.data + load = dst._parameters[n].data + if conv_layer and 'weight' in n: + data = data.t().contiguous() + load.copy_(data) + + +# dst._parameters[n].data.copy_(data) + + +def load_mlp(our, oai, dst2src=False): + load_weights(oai.c_fc, our.dense_h_to_4h, dst2src) + load_weights(oai.c_proj, our.dense_4h_to_h, dst2src) + + +def load_attention(our, oai, dst2src=False): + load_weights(oai.c_attn, our.query_key_value, dst2src) + load_weights(oai.c_proj, our.dense, dst2src) + + +def load_transformer_layer(our, oai, dst2src=False): + load_weights(oai.ln_1, our.input_layernorm, dst2src) + load_weights(oai.ln_2, our.post_attention_layernorm, dst2src) + load_mlp(our.mlp, oai.mlp, dst2src) + load_attention(our.attention, oai.attn, dst2src) + + +def move_weights(our, oai, dst2src=False): + """ + Loads weights from `oai` to `our` via in place copy. + `oai` is a huggingface gpt2model, while `our` is one of our models. + dst2src=True loads parameters from our models into huggingface's. 
+ ^dst2src=True is still untested + """ + # while isinstance(our, (torchDDP, model.distributed.DistributedDataParallel, FP16_Module)): + # our=our.module + transformer_model = oai.transformer + load_weights(transformer_model.ln_f, our.transformer.final_layernorm, + dst2src) + load_weights(transformer_model.wte, our.word_embeddings, dst2src) + load_weights(transformer_model.wpe, our.position_embeddings, dst2src) + + for our_layer, oai_layer in zip(our.transformer.layers, oai.transformer.h): + load_transformer_layer(our_layer, oai_layer, dst2src) + + +def debug_finetune_data(local_vars, batch_id, tokenizer): + tokens, target_ids = local_vars['tokens'], local_vars['target_ids'] + attention_mask, logit_mask, position_ids = local_vars[ + 'attention_mask'], local_vars['logit_mask'], local_vars['position_ids'] + output_tokens = [] + sep = attention_mask[batch_id].item() + for i, token in enumerate(tokens[batch_id][:sep].tolist()): + token = tokenizer.IdToToken(token) + if token == '[MASK]': + token = f'[{position_ids[batch_id][0, i].item()}]' + output_tokens.append(token) + print(' '.join(output_tokens)) + target_positions = [] + for i in range(sep, tokens.size(-1)): + if logit_mask[batch_id][i]: + target_positions.append(i) + print(target_positions) + print(tokenizer.DecodeIds(tokens[batch_id][target_positions].tolist())) + if len(target_ids.shape) > 2: + print( + tokenizer.DecodeIds( + target_ids[batch_id][target_positions].tolist())) + else: + print(tokenizer.DecodeIds(target_ids[batch_id].tolist())) + print(position_ids[batch_id][:, target_positions]) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index cbdeede4..b983125a 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -516,6 +516,12 @@ TASK_OUTPUTS = { # } Tasks.text_generation: [OutputKeys.TEXT], + # summarization result for single sample + # { + # "text": "this is the text generated by a model." 
+ # } + Tasks.text_summarization: [OutputKeys.TEXT], + # text generation result for single sample # { # "text": "北京" diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 7b726308..1206ae08 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline + from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -71,6 +72,7 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], + 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py new file mode 100644 index 00000000..c6d03077 --- /dev/null +++ b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.models.nlp import MGLMForTextSummarization +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import (MGLMSummarizationPreprocessor, + Preprocessor) +from modelscope.utils.constant import Tasks + +__all__ = ['MGLMTextSummarizationPipeline'] + + +@PIPELINES.register_module( + group_key=Tasks.text_summarization, + module_name=Pipelines.mglm_text_summarization) +class MGLMTextSummarizationPipeline(Pipeline): + + def __init__(self, + model: Union[MGLMForTextSummarization, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = MGLMForTextSummarization(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + if preprocessor is None: + preprocessor = MGLMSummarizationPreprocessor() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + # define the forward pass + def forward(self, inputs: Union[Dict, str], + **forward_params) -> Dict[str, Any]: + inputs = {'text': inputs} if isinstance(inputs, str) else inputs + return self.model.generate(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index e568098f..0db1c7e0 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -18,16 +18,16 @@ if TYPE_CHECKING: from .nlp import ( DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, FillMaskPoNetPreprocessor, NLPPreprocessor, - NLPTokenizerPreprocessorBase, TextRankingPreprocessor, - RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, TokenClassificationPreprocessor, - TextErrorCorrectionPreprocessor, TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, Tokenize, + NLPTokenizerPreprocessorBase, PassageRankingPreprocessor, + 
TextRankingPreprocessor, RelationExtractionPreprocessor, + SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, + TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, - DialogModelingPreprocessor, DialogStateTrackingPreprocessor, - ConversationalTextToSqlPreprocessor, + MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, + DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, + DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, TableQuestionAnsweringPreprocessor, NERPreprocessorViet, NERPreprocessorThai, WordSegmentationPreprocessorThai) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor @@ -57,6 +57,7 @@ else: 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', + 'MGLMSummarizationPreprocessor', 'ZeroShotClassificationPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index d9c55fe1..7c48fb3c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -29,6 +29,7 @@ if TYPE_CHECKING: MultiWOZBPETextField, IntentBPETextField) from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor + from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor else: _import_structure = { 'nlp_base': [ @@ -62,6 +63,7 @@ else: 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py b/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py new file mode 100644 index 00000000..0a68a9fa --- /dev/null +++ b/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 Zhipu.AI + +import os.path as osp +import re +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +from modelscope.metainfo import Models, Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger +from modelscope.utils.nlp import import_external_nltk_data +from modelscope.utils.type_assert import type_assert + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.mglm_summarization) +class MGLMSummarizationPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """preprocess the data + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, 
tuple, Dict]) -> Dict[str, Any]: + return data diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 9a4abd71..80fee546 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,18 +1,25 @@ +boto3 en_core_web_sm>=2.3.5 +fasttext +filelock +ftfy jieba>=0.42.1 -megatron_util +matplotlib +nltk pai-easynlp +pandas # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 pythainlp pyvi -# rough-score was just recently updated from 0.0.4 to 0.0.7 -# which introduced compatability issues that are being investigated -rouge_score<=0.0.4 +regex sacremoses>=0.0.41 +scikit_learn +sentencepiece seqeval spacy>=2.3.5 subword_nmt>=0.3.8 +termcolor text2sql_lgesql tokenizers transformers>=4.12.0 diff --git a/tests/pipelines/test_mglm_text_summarization.py b/tests/pipelines/test_mglm_text_summarization.py new file mode 100644 index 00000000..47abc741 --- /dev/null +++ b/tests/pipelines/test_mglm_text_summarization.py @@ -0,0 +1,47 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.preprocessors import MGLMSummarizationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class mGLMTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.output_dir = 'unittest_output' + os.makedirs(self.output_dir, exist_ok=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_mglm_with_name(self): + model = 'ZhipuAI/Multilingual-GLM-Summarization-zh' + preprocessor = MGLMSummarizationPreprocessor() + pipe = pipeline( + task=Tasks.text_summarization, + model=model, + preprocessor=preprocessor, + ) + result = pipe( + '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。' # noqa + ) + print(result) + + model = 'ZhipuAI/Multilingual-GLM-Summarization-en' + preprocessor = MGLMSummarizationPreprocessor() + pipe = pipeline( + task=Tasks.text_summarization, + model=model, + preprocessor=preprocessor, + ) + result = pipe( + '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。' # noqa + ) + print(result) + + +if __name__ == '__main__': + unittest.main() From 4b7e8e89aade38131e35e05d04fd4aa2dacca0c9 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 28 Oct 2022 21:44:33 +0800 Subject: [PATCH 04/46] [to #42322933] Fix some bugs when downgrade the version of some dependencies 1. Fix bug in model exporting 2. Skip some long trainings in test level 2 3. Refine some comments 4. 
Fix a bug that mode is not correct when saving checkpoints Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10564716 --- modelscope/exporters/torch_model_exporter.py | 86 +++++++++++++++++-- modelscope/models/base/base_model.py | 7 ++ modelscope/models/nlp/bert/text_ranking.py | 1 + .../nlp/structbert/text_classification.py | 1 + modelscope/trainers/trainer.py | 4 +- ...st_export_sbert_sequence_classification.py | 2 +- .../test_finetune_sequence_classification.py | 2 +- tests/trainers/test_trainer_with_nlp.py | 2 +- 8 files changed, 92 insertions(+), 13 deletions(-) diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 94ef277a..7bf6c0c0 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -7,9 +7,9 @@ from typing import Any, Dict, Mapping import torch from torch import nn from torch.onnx import export as onnx_export -from torch.onnx.utils import _decide_input_format from modelscope.models import TorchModel +from modelscope.outputs import ModelOutputBase from modelscope.pipelines.base import collate_fn from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger @@ -102,6 +102,53 @@ class TorchModelExporter(Exporter): """ return None + @staticmethod + def _decide_input_format(model, args): + import inspect + + def _signature(model) -> inspect.Signature: + should_be_callable = getattr(model, 'forward', model) + if callable(should_be_callable): + return inspect.signature(should_be_callable) + raise ValueError('model has no forward method and is not callable') + + try: + sig = _signature(model) + except ValueError as e: + logger.warn('%s, skipping _decide_input_format' % e) + return args + try: + ordered_list_keys = list(sig.parameters.keys()) + if ordered_list_keys[0] == 'self': + ordered_list_keys = ordered_list_keys[1:] + args_dict: Dict = {} + if isinstance(args, list): + args_list = args + elif isinstance(args, tuple): + args_list = list(args) + else: + args_list = [args] + if isinstance(args_list[-1], dict): + args_dict = args_list[-1] + args_list = args_list[:-1] + n_nonkeyword = len(args_list) + for optional_arg in ordered_list_keys[n_nonkeyword:]: + if optional_arg in args_dict: + args_list.append(args_dict[optional_arg]) + # Check if this arg has a default value + else: + param = sig.parameters[optional_arg] + if param.default != param.empty: + args_list.append(param.default) + args = args_list if isinstance(args, list) else tuple(args_list) + # Cases of models with no input args + except IndexError: + logger.warn('No input args, skipping _decide_input_format') + except Exception as e: + logger.warn('Skipping _decide_input_format\n {}'.format(e.args[0])) + + return args + def _torch_export_onnx(self, model: nn.Module, output: str, @@ -179,16 +226,21 @@ class TorchModelExporter(Exporter): with torch.no_grad(): model.eval() outputs_origin = model.forward( - *_decide_input_format(model, dummy_inputs)) - if isinstance(outputs_origin, Mapping): - outputs_origin = numpify_tensor_nested( - list(outputs_origin.values())) + *self._decide_input_format(model, dummy_inputs)) + if isinstance(outputs_origin, (Mapping, ModelOutputBase)): + outputs_origin = list( + numpify_tensor_nested(outputs_origin).values()) elif isinstance(outputs_origin, (tuple, list)): - outputs_origin = numpify_tensor_nested(outputs_origin) + outputs_origin = list(numpify_tensor_nested(outputs_origin)) outputs = ort_session.run( onnx_outputs, 
numpify_tensor_nested(dummy_inputs), ) + outputs = numpify_tensor_nested(outputs) + if isinstance(outputs, dict): + outputs = list(outputs.values()) + elif isinstance(outputs, tuple): + outputs = list(outputs) tols = {} if rtol is not None: @@ -232,12 +284,26 @@ class TorchModelExporter(Exporter): 'Model property dummy_inputs must be set.') dummy_inputs = collate_fn(dummy_inputs, device) if isinstance(dummy_inputs, Mapping): - dummy_inputs = tuple(dummy_inputs.values()) + dummy_inputs = self._decide_input_format(model, dummy_inputs) + dummy_inputs_filter = [] + for _input in dummy_inputs: + if _input is not None: + dummy_inputs_filter.append(_input) + else: + break + + if len(dummy_inputs) != len(dummy_inputs_filter): + logger.warn( + f'Dummy inputs is not continuous in the forward method, ' + f'origin length: {len(dummy_inputs)}, ' + f'the length after filtering: {len(dummy_inputs_filter)}') + dummy_inputs = dummy_inputs_filter + with torch.no_grad(): model.eval() with replace_call(): traced_model = torch.jit.trace( - model, dummy_inputs, strict=strict) + model, tuple(dummy_inputs), strict=strict) torch.jit.save(traced_model, output) if validation: @@ -249,6 +315,10 @@ class TorchModelExporter(Exporter): outputs = numpify_tensor_nested(outputs) outputs_origin = model.forward(*dummy_inputs) outputs_origin = numpify_tensor_nested(outputs_origin) + if isinstance(outputs, dict): + outputs = list(outputs.values()) + if isinstance(outputs_origin, dict): + outputs_origin = list(outputs_origin.values()) tols = {} if rtol is not None: tols['rtol'] = rtol diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index e01d1f05..1ca7e030 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -161,5 +161,12 @@ class Model(ABC): assert config is not None, 'Cannot save the model because the model config is empty.' 
if isinstance(config, Config): config = config.to_dict() + if 'preprocessor' in config and config['preprocessor'] is not None: + if 'mode' in config['preprocessor']: + config['preprocessor']['mode'] = 'inference' + elif 'val' in config['preprocessor'] and 'mode' in config[ + 'preprocessor']['val']: + config['preprocessor']['val']['mode'] = 'inference' + save_pretrained(self, target_folder, save_checkpoint_names, save_function, config, **kwargs) diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py index d6bbf277..b5ac8d7e 100644 --- a/modelscope/models/nlp/bert/text_ranking.py +++ b/modelscope/models/nlp/bert/text_ranking.py @@ -36,6 +36,7 @@ class BertForTextRanking(BertForSequenceClassification): output_attentions=None, output_hidden_states=None, return_dict=None, + *args, **kwargs) -> AttentionTextClassificationModelOutput: outputs = self.base_model.forward( input_ids=input_ids, diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py index 044cf8d0..8797beb3 100644 --- a/modelscope/models/nlp/structbert/text_classification.py +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -109,6 +109,7 @@ class SbertForSequenceClassification(SbertPreTrainedModel): output_attentions=None, output_hidden_states=None, return_dict=None, + *args, **kwargs): r""" Args: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index aaf24cfa..7478d8e4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -672,7 +672,7 @@ class EpochBasedTrainer(BaseTrainer): self.model, cfg=cfg, default_args=default_args) except KeyError as e: self.logger.error( - f'Build optimizer error, the optimizer {cfg} is native torch optimizer, ' + f'Build optimizer error, the optimizer {cfg} is a torch native component, ' f'please check if your torch with version: {torch.__version__} matches the config.' ) raise e @@ -682,7 +682,7 @@ class EpochBasedTrainer(BaseTrainer): return build_lr_scheduler(cfg=cfg, default_args=default_args) except KeyError as e: self.logger.error( - f'Build lr_scheduler error, the lr_scheduler {cfg} is native torch lr_scheduler, ' + f'Build lr_scheduler error, the lr_scheduler {cfg} is a torch native component, ' f'please check if your torch with version: {torch.__version__} matches the config.' 
) raise e diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 0e4f8349..7533732d 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -23,7 +23,7 @@ class TestExportSbertSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skip + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_export_sbert_sequence_classification(self): model = Model.from_pretrained(self.model_id) print( diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index 02dd9d2f..061d37d3 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -38,7 +38,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skip def test_trainer_cfg_class(self): dataset = MsDataset.load('clue', subset_name='tnews') train_dataset = dataset['train'] diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index d9d56b60..f1d9e414 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -72,7 +72,7 @@ class TestTrainerWithNlp(unittest.TestCase): output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) pipeline_sentence_similarity(output_dir) - @unittest.skipUnless(test_level() >= 3, 'skip test in current test level') + @unittest.skip def test_trainer_with_backbone_head(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' kwargs = dict( From 3791ee7ad2a1e4cc8f5586c7de138ef58a2db3db Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Sat, 29 Oct 2022 13:44:47 +0800 Subject: [PATCH 05/46] [to #45821936]fix: fix block user specify revision after release_datetime Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10572162 --- modelscope/hub/api.py | 11 ++- tests/hub/test_hub_revision_release_mode.py | 84 ++++++++++++++++++++- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 5923319d..dca6d099 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -382,10 +382,11 @@ class HubApi: logger.info('Model revision not specified, use default: %s in development mode' % revision) if revision not in branches and revision not in tags: raise NotExistError('The model: %s has no branch or tag : %s .' % revision) + logger.info('Development mode use revision: %s' % revision) else: - revisions = self.list_model_revisions( - model_id, cutoff_timestamp=release_timestamp, use_cookies=False if cookies is None else cookies) - if revision is None: + if revision is None: # user not specified revision, use latest revision before release time + revisions = self.list_model_revisions( + model_id, cutoff_timestamp=release_timestamp, use_cookies=False if cookies is None else cookies) if len(revisions) == 0: raise NoValidRevisionError('The model: %s has no valid revision!' 
% model_id) # tags (revisions) returned from backend are guaranteed to be ordered by create-time @@ -393,9 +394,13 @@ class HubApi: revision = revisions[0] logger.info('Model revision not specified, use the latest revision: %s' % revision) else: + # use user-specified revision + revisions = self.list_model_revisions( + model_id, cutoff_timestamp=current_timestamp, use_cookies=False if cookies is None else cookies) if revision not in revisions: raise NotExistError( 'The model: %s has no revision: %s !' % (model_id, revision)) + logger.info('Use user-specified model revision: %s' % revision) return revision def get_model_branches_and_tags( diff --git a/tests/hub/test_hub_revision_release_mode.py b/tests/hub/test_hub_revision_release_mode.py index 729a1861..73a0625e 100644 --- a/tests/hub/test_hub_revision_release_mode.py +++ b/tests/hub/test_hub_revision_release_mode.py @@ -115,7 +115,7 @@ class HubRevisionTest(unittest.TestCase): time.sleep(10) self.add_new_file_and_tag_to_repo() t2 = datetime.now().isoformat(sep=' ', timespec='seconds') - logger.info('Secnod time: %s' % t2) + logger.info('Second time: %s' % t2) # set release_datetime_backup = version.__release_datetime__ logger.info('Origin __release_datetime__: %s' @@ -142,6 +142,43 @@ class HubRevisionTest(unittest.TestCase): finally: version.__release_datetime__ = release_datetime_backup + def test_snapshot_download_revision_user_set_revision(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data_and_tag() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time: %s' % t1) + time.sleep(10) + self.add_new_file_and_tag_to_repo() + t2 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('Secnod time: %s' % t2) + # set + release_datetime_backup = version.__release_datetime__ + logger.info('Origin __release_datetime__: %s' + % version.__release_datetime__) + try: + logger.info('Setting __release_datetime__ to: %s' % t1) + version.__release_datetime__ = t1 + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=self.revision, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert not os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + finally: + version.__release_datetime__ = release_datetime_backup + def test_file_download_revision(self): with mock.patch.dict(os.environ, self.modified_environ, clear=True): self.prepare_repo_data_and_tag() @@ -175,7 +212,6 @@ class HubRevisionTest(unittest.TestCase): self.model_id, download_model_file_name, cache_dir=temp_cache_dir) - print('Downloaded file path: %s' % file_path) assert os.path.exists(file_path) file_path = model_file_download( self.model_id, @@ -185,6 +221,50 @@ class HubRevisionTest(unittest.TestCase): finally: version.__release_datetime__ = release_datetime_backup + def test_file_download_revision_user_set_revision(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data_and_tag() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time stamp: %s' 
% t1) + time.sleep(10) + self.add_new_file_and_tag_to_repo() + t2 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('Second time: %s' % t2) + release_datetime_backup = version.__release_datetime__ + logger.info('Origin __release_datetime__: %s' + % version.__release_datetime__) + try: + version.__release_datetime__ = t1 + logger.info('Setting __release_datetime__ to: %s' % t1) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=self.revision, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + with self.assertRaises(NotExistError): + model_file_download( + self.model_id, + download_model_file_name2, + revision=self.revision, + cache_dir=temp_cache_dir) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + file_path = model_file_download( + self.model_id, + download_model_file_name2, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + finally: + version.__release_datetime__ = release_datetime_backup + if __name__ == '__main__': unittest.main() From ae55fed2162bae29e7bda5ec821109ae5e7962e0 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 29 Oct 2022 14:37:56 +0800 Subject: [PATCH 06/46] bumpy version to 1.0.0 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 541dfc57..ca813cc0 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
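Note: the new tests pin version.__release_datetime__ between two tags to exercise the rule this patch fixes. A simplified illustration of that rule (not the actual HubApi code; the helper and timestamps are made up): when no revision is given, only tags created before the release datetime are eligible, while a user-specified revision may be newer and only has to exist.

def resolve_revision(user_revision, tags, release_ts, current_ts):
    """tags: list of (tag_name, create_ts), newest first (illustrative helper)."""
    if user_revision is None:
        eligible = [name for name, ts in tags if ts <= release_ts]
        if not eligible:
            raise ValueError('no valid revision before the release datetime')
        return eligible[0]  # latest tag created before the release datetime
    if user_revision not in [name for name, ts in tags if ts <= current_ts]:
        raise ValueError('revision %s does not exist' % user_revision)
    return user_revision  # user-specified revisions are not blocked by release time


tags = [('v1.0.1', 200), ('v1.0.0', 100)]
assert resolve_revision(None, tags, release_ts=150, current_ts=250) == 'v1.0.0'
assert resolve_revision('v1.0.1', tags, release_ts=150, current_ts=250) == 'v1.0.1'
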
-__version__ = '0.5.0'
+__version__ = '1.0.0'
 # default release datetime for branches under active development is set
 # to be a time far-far-away-into-the-future
 __release_datetime__ = '2099-10-13 08:56:12'

From e07f3cdbf5a8a6de91fc19f32be14eda7a6e94c4 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm"
Date: Sat, 29 Oct 2022 15:05:26 +0800
Subject: [PATCH 07/46] remove fasttext

---
 requirements/nlp.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index 80fee546..433f70f7 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1,6 +1,5 @@
 boto3
 en_core_web_sm>=2.3.5
-fasttext
 filelock
 ftfy
 jieba>=0.42.1

From 29448c0f578757799e16d138d3b1af42db85fde5 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sun, 30 Oct 2022 11:15:52 +0800
Subject: [PATCH 08/46] [to #42322933] disable vit

---
 tests/pipelines/test_face_emotion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/test_face_emotion.py b/tests/pipelines/test_face_emotion.py
index 907e15ee..96fe51a7 100644
--- a/tests/pipelines/test_face_emotion.py
+++ b/tests/pipelines/test_face_emotion.py
@@ -17,12 +17,12 @@ class FaceEmotionTest(unittest.TestCase):
         result = pipeline(input)
         print(result)

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('skip since the model is set to private for now')
     def test_run_modelhub(self):
         face_emotion = pipeline(Tasks.face_emotion, model=self.model)
         self.pipeline_inference(face_emotion, self.img)

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skip('skip since the model is set to private for now')
     def test_run_modelhub_default_model(self):
         face_emotion = pipeline(Tasks.face_emotion)
         self.pipeline_inference(face_emotion, self.img)

From 902019c2e01c8fa1583f91d2b772872db6ebc75a Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sun, 30 Oct 2022 13:55:49 +0800
Subject: [PATCH 09/46] [to #42322933] disable vgg19_fer

---
 tests/pipelines/test_facial_expression_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/test_facial_expression_recognition.py b/tests/pipelines/test_facial_expression_recognition.py
index fff83ad6..f5151bef 100644
--- a/tests/pipelines/test_facial_expression_recognition.py
+++ b/tests/pipelines/test_facial_expression_recognition.py
@@ -23,7 +23,7 @@ class FacialExpressionRecognitionTest(unittest.TestCase):
         cv2.imwrite('result.png', img)
         print(f'output written to {osp.abspath("result.png")}')

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skip('skip since the model is set to private for now')
     def test_run_modelhub(self):
         fer = pipeline(
             Tasks.facial_expression_recognition, model=self.model_id)

From 9f7b8b86a33d65d6374b19b355a7ea9d1e572f80 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sun, 30 Oct 2022 13:59:12 +0800
Subject: [PATCH 10/46] [to #42322933] disable 2dkeypoints training since face_2d_keypoints_dataset is set to be private

---
 tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py b/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py
index 4dffa998..e4f0c57e 100644
--- a/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py
+++ b/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py
@@ -50,7 +50,8 @@ class EasyCVTrainerTestFace2DKeypoints(unittest.TestCase):
         trainer = 
build_trainer(trainer_name, kwargs) trainer.train() - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skip( + 'skip since face_2d_keypoints_dataset is set to private for now') def test_trainer_single_gpu(self): temp_file_dir = tempfile.TemporaryDirectory() tmp_dir = temp_file_dir.name From e2d35fbb14b342c8ffc214469bca622bf954983c Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Sun, 30 Oct 2022 21:51:11 +0800 Subject: [PATCH 11/46] =?UTF-8?q?[to=20#42322933]clip=E6=94=AF=E6=8C=81fin?= =?UTF-8?q?etune=20=20=20=20=20=20=20=20=20Link:=20https://code.alibaba-in?= =?UTF-8?q?c.com/Ali-MaaS/MaaS-lib/codereview/10572842?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modelscope/metainfo.py | 6 + modelscope/metrics/builder.py | 1 + modelscope/metrics/inbatch_recall_metric.py | 55 +++ modelscope/models/multi_modal/clip/model.py | 156 ++------ .../multi_modal_embedding_pipeline.py | 22 +- modelscope/preprocessors/multi_modal.py | 177 +++++++++ .../hooks/clip_clamp_logit_scale_hook.py | 18 + .../trainers/multi_modal/clip/clip_trainer.py | 345 ++++++++++-------- .../multi_modal/clip/clip_trainer_utils.py | 211 ++++++----- tests/pipelines/test_multi_modal_embedding.py | 6 +- tests/trainers/test_clip_trainer.py | 83 +++++ 11 files changed, 704 insertions(+), 376 deletions(-) create mode 100644 modelscope/metrics/inbatch_recall_metric.py create mode 100644 modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py create mode 100644 tests/trainers/test_clip_trainer.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 3951541c..8c9964b8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -389,6 +389,7 @@ class Preprocessors(object): # multi-modal preprocessor ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' + clip_preprocessor = 'clip-preprocessor' mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' # science preprocessor @@ -428,6 +429,8 @@ class Metrics(object): image_inpainting_metric = 'image-inpainting-metric' # metric for ocr NED = 'ned' + # metric for cross-modal retrieval + inbatch_recall = 'inbatch_recall' # metric for referring-video-object-segmentation task referring_video_object_segmentation_metric = 'referring-video-object-segmentation-metric' @@ -474,6 +477,9 @@ class Hooks(object): # Compression SparsityHook = 'SparsityHook' + # CLIP logit_scale clamp + ClipClampLogitScaleHook = 'ClipClampLogitScaleHook' + class LR_Schedulers(object): """learning rate scheduler is defined here diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 2b61c1ae..b9e402c5 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -24,6 +24,7 @@ class MetricKeys(object): ROUGE_1 = 'rouge-1' ROUGE_L = 'rouge-l' NED = 'ned' # ocr metric + BatchAcc = 'inbatch_t2i_recall_at_1' task_default_metrics = { diff --git a/modelscope/metrics/inbatch_recall_metric.py b/modelscope/metrics/inbatch_recall_metric.py new file mode 100644 index 00000000..d098a883 --- /dev/null +++ b/modelscope/metrics/inbatch_recall_metric.py @@ -0,0 +1,55 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
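Note: the new Metrics.inbatch_recall and Hooks.ClipClampLogitScaleHook keys are resolved through the registry pattern used throughout ModelScope: a string in the configuration is looked up in a registry populated by class decorators. A stripped-down sketch of that mechanism (the real implementation lives in modelscope.utils.registry; the names below are simplified stand-ins):

METRICS_REGISTRY = {}


def register_metric(name):

    def decorator(cls):
        METRICS_REGISTRY[name] = cls
        return cls

    return decorator


@register_metric('inbatch_recall')
class DummyInbatchRecall:

    def __init__(self, **kwargs):
        self.kwargs = kwargs


def build_metric(cfg):
    cfg = dict(cfg)
    return METRICS_REGISTRY[cfg.pop('type')](**cfg)


# mirrors the evaluation config entry: 'metrics': [{'type': 'inbatch_recall'}]
metric = build_metric({'type': 'inbatch_recall'})
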
+ +from typing import Dict + +import numpy as np +import torch + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.inbatch_recall) +class InbatchRecallMetric(Metric): + """The metric computation class for in-batch retrieval classes. + + This metric class calculates in-batch image recall@1 for each input batch. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inbatch_t2i_hitcnts = [] + self.batch_sizes = [] + + def add(self, outputs: Dict, inputs: Dict): + image_features = outputs[OutputKeys.IMG_EMBEDDING] + text_features = outputs[OutputKeys.TEXT_EMBEDDING] + + assert type(image_features) == torch.Tensor and type( + text_features) == torch.Tensor + + with torch.no_grad(): + logits_per_image = image_features @ text_features.t() + logits_per_text = logits_per_image.t() + batch_size = logits_per_image.shape[0] + + ground_truth = torch.arange(batch_size).long() + ground_truth = ground_truth.to(image_features.device) + + inbatch_t2i_hitcnt = (logits_per_text.argmax(-1) == ground_truth + ).sum().float().item() + + self.inbatch_t2i_hitcnts.append(inbatch_t2i_hitcnt) + self.batch_sizes.append(batch_size) + + def evaluate(self): + assert len(self.inbatch_t2i_hitcnts) == len( + self.batch_sizes) and len(self.batch_sizes) > 0 + return { + MetricKeys.BatchAcc: + sum(self.inbatch_t2i_hitcnts) / sum(self.batch_sizes) + } diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index 92d9e11a..b1c84292 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -15,15 +15,13 @@ import os from collections import OrderedDict -from typing import Any, Dict, Iterable, List, Tuple, Union +from typing import Any, Dict, Tuple, Union import json import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torchvision.transforms import Compose, Normalize, Resize, ToTensor from modelscope.metainfo import Models from modelscope.models import TorchModel @@ -506,21 +504,6 @@ def convert_weights(model: nn.Module): model.apply(_convert_weights_to_fp16) -def _convert_to_rgb(image): - return image.convert('RGB') - - -def image_transform(image_size=224): - transform = Compose([ - _convert_to_rgb, - Resize((image_size, image_size)), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)), - ]) - return transform - - @MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) class CLIPForMultiModalEmbedding(TorchModel): @@ -540,72 +523,40 @@ class CLIPForMultiModalEmbedding(TorchModel): with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft: - model_info = json.load(fv) + self.model_info = json.load(fv) for k, v in json.load(ft).items(): - model_info[k] = v - - # image preprocess - self.img_preprocess = image_transform(model_info['image_resolution']) + self.model_info[k] = v - # text tokenizer vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' self.tokenizer = FullTokenizer(vocab_file=vocab_file) # initialize the model - self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer) + self.clip_model = CLIP(**self.model_info, tokenizer=self.tokenizer) convert_weights(self.clip_model) # restore the pretrained 
weight checkpoint = torch.load( f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu') - sd = checkpoint['state_dict'] + sd = checkpoint[ + 'state_dict'] if 'state_dict' in checkpoint else checkpoint if next(iter(sd.items()))[0].startswith('module'): sd = {k[len('module.'):]: v for k, v in sd.items()} + # support the finetuned model + if next(iter(sd.items()))[0].startswith('clip_model'): + sd = {k[len('clip_model.'):]: v for k, v in sd.items()} self.clip_model.load_state_dict(sd) self.clip_model.eval() # place the model - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - if self.device == 'cuda': + self.device = 'cuda:{}'.format(int(os.environ.get( + 'LOCAL_RANK', 0))) if torch.cuda.is_available() else 'cpu' + if torch.cuda.is_available(): self.clip_model.to(self.device) - logger.info('Use GPU for inference') + logger.info('Use GPU {} for finetuning & inference'.format( + int(os.environ.get('LOCAL_RANK', 0)))) else: self.clip_model.float() - logger.info('Use CPU for inference') - - def tokenize(self, - texts: Union[str, List[str]], - context_length: int = 52) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s) - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all baseline models use 24 as the context length - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] - - all_tokens = [] - for text in texts: - all_tokens.append( - [self.tokenizer.vocab['[CLS]']] - + self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(text))[:context_length - 2] - + [self.tokenizer.vocab['[SEP]']]) - - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - - for i, tokens in enumerate(all_tokens): - assert len(tokens) <= context_length - result[i, :len(tokens)] = torch.tensor(tokens) - - return result + logger.info('Use CPU for finetuning & inference') def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: from modelscope.outputs import OutputKeys @@ -613,75 +564,36 @@ class CLIPForMultiModalEmbedding(TorchModel): OutputKeys.IMG_EMBEDDING: None, OutputKeys.TEXT_EMBEDDING: None } - if 'img' in input and input['img'] is not None: - image_input = input['img'] - - # single image input - if isinstance(image_input, Image.Image): - image_tensor = self.img_preprocess(image_input).unsqueeze(0) - # multi images input - elif isinstance(image_input, list): - if all([isinstance(elem, Image.Image) - for elem in image_input]): - image_tensor = torch.stack( - [self.img_preprocess(elem) for elem in image_input], - dim=0) - else: - unsupported_elem_type = [ - type(elem) for elem in image_input - if not isinstance(elem, Image.Image) - ][0] - raise TypeError( - f'img should be PIL.Image or List[PIL.Image], \ - but got a List containing one {unsupported_elem_type}' - ) - # others - else: - raise TypeError( - f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' - ) - - image_tensor = image_tensor.to(self.device) - - with torch.no_grad(): + mode = input.get('mode', ModeKeys.INFERENCE) + + # encode the image + if 'img' in input and isinstance(input['img'], torch.Tensor): + image_tensor = input['img'].to(self.device) + if image_tensor.dim() == 5 and image_tensor.shape[1] == 1: + image_tensor = image_tensor.squeeze(1) + + with torch.autograd.set_grad_enabled(mode == ModeKeys.TRAIN): 
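+                # gradients are tracked only when mode == ModeKeys.TRAIN (finetuning);
+                # for inference this is equivalent to running under torch.no_grad()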
image_features = self.clip_model.encode_image(image_tensor) image_features /= image_features.norm( dim=-1, keepdim=True) # l2-normalize output[OutputKeys.IMG_EMBEDDING] = image_features - if 'text' in input and input['text'] is not None: - text_input = input['text'] - - # single text input - if isinstance(text_input, str): - text_tensor = self.tokenize(text_input) - # multi texts input - elif isinstance(text_input, list): - if all([isinstance(elem, str) for elem in text_input]): - text_tensor = self.tokenize(text_input) - else: - unsupported_elem_type = [ - type(elem) for elem in text_input - if not isinstance(elem, str) - ][0] - raise TypeError( - f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' - ) - # others - else: - raise TypeError( - f'text should be str or List[str], but got {type(text_input)}' - ) - - text_tensor = text_tensor.to(self.device) - - with torch.no_grad(): + if 'text' in input and isinstance(input['text'], torch.Tensor): + text_tensor = input['text'].to(self.device) + if text_tensor.dim() == 3 and text_tensor.shape[1] == 1: + text_tensor = text_tensor.squeeze(1) + + with torch.autograd.set_grad_enabled(mode == ModeKeys.TRAIN): text_features = self.clip_model.encode_text(text_tensor) text_features /= text_features.norm( dim=-1, keepdim=True) # l2-normalize output[OutputKeys.TEXT_EMBEDDING] = text_features + if mode == ModeKeys.TRAIN: + output['logit_scale'] = (self.clip_model.logit_scale + * 1.0).exp().mean() + return output def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py index d3f15c23..18ee1dbf 100644 --- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
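Note: because both embeddings above are l2-normalized, downstream ranking reduces to a dot product. A small usage sketch built only on that property (tensor shapes, dimensions and names are illustrative):

import torch
import torch.nn.functional as F


def rank_texts(img_emb: torch.Tensor, txt_emb: torch.Tensor) -> torch.Tensor:
    # img_emb: (1, dim), txt_emb: (num_texts, dim); both l2-normalized,
    # so the matrix product equals the cosine similarity
    scores = (img_emb @ txt_emb.t()).squeeze(0)
    return scores.argsort(descending=True)


img_emb = F.normalize(torch.randn(1, 512), dim=-1)
txt_emb = F.normalize(torch.randn(5, 512), dim=-1)
print(rank_texts(img_emb, txt_emb))  # text indices, best match first
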
-from typing import Any, Dict +from typing import Any, Dict, Optional, Union from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal.clip.model import CLIPForMultiModalEmbedding from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.multi_modal import CLIPPreprocessor, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -17,7 +19,10 @@ logger = get_logger() Tasks.multi_modal_embedding, module_name=Pipelines.multi_modal_embedding) class MultiModalEmbeddingPipeline(Pipeline): - def __init__(self, model: str, device: str = 'gpu'): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): """ use `model` and `preprocessor` to create a kws pipeline for prediction Args: @@ -29,14 +34,17 @@ class MultiModalEmbeddingPipeline(Pipeline): pipe_model = model else: raise NotImplementedError('model must be a single str') + pipe_model.eval() + if preprocessor is None: + if isinstance(pipe_model, CLIPForMultiModalEmbedding): + preprocessor = CLIPPreprocessor(pipe_model.model_dir) + else: + raise NotImplementedError - super().__init__(model=pipe_model) - - def preprocess(self, input: Input) -> Dict[str, Any]: - return input + super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - return self.model(input) + return self.model(self.preprocess(input)) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 557b469a..17dffb48 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -3,8 +3,11 @@ import os.path as osp from io import BytesIO from typing import Any, Dict, List, Tuple, Union +import json import torch from PIL import Image +from timm.data import create_transform +from torchvision.transforms import Compose, Normalize, Resize, ToTensor from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Preprocessors @@ -107,6 +110,180 @@ class OfaPreprocessor(Preprocessor): eos_idx=self.tokenizer.eos_token_id) +def _convert_to_rgb(image): + return image.convert('RGB') + + +@PREPROCESSORS.register_module( + Fields.multi_modal, module_name=Preprocessors.clip_preprocessor) +class CLIPPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + mode: preprocessor mode (model mode) + """ + super().__init__(*args, **kwargs) + model_dir = model_dir if osp.exists(model_dir) else snapshot_download( + model_dir) + self.mode = mode + # text tokenizer + from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer + if 'tokenizer' in kwargs and isinstance(kwargs['tokenizer'], + FullTokenizer): + self.tokenizer = kwargs['tokenizer'] + else: + vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' + self.tokenizer = FullTokenizer(vocab_file=vocab_file) + # image preprocessor + if 'resolution' in kwargs and isinstance(kwargs['resolution'], int): + self.image_resolution = kwargs['resolution'] + else: + self.image_resolution = json.load( + open('{}/vision_model_config.json'.format( + model_dir)))['image_resolution'] + self.img_preprocess = self._build_image_transform() + # key mapping + # specify the 
input keys, compatible with training and inference whose key names may be different + self.input_keys = {'img': 'img', 'text': 'text'} + + def _build_image_transform(self): + + if self.mode == ModeKeys.TRAIN: + transform = create_transform( + input_size=self.image_resolution, + scale=(0.9, 1.0), + is_training=True, + color_jitter=None, + auto_augment='original', + interpolation='bicubic', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ) + transform = Compose(transform.transforms[:-3] + [_convert_to_rgb] + + transform.transforms[-3:]) + else: + transform = Compose([ + Resize((self.image_resolution, self.image_resolution), + interpolation=Image.BICUBIC), + _convert_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + return transform + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 52) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all baseline models use 24 as the context length + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + all_tokens = [] + for text in texts: + all_tokens.append( + [self.tokenizer.vocab['[CLS]']] + + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text))[:context_length - 2] + + [self.tokenizer.vocab['[SEP]']]) + + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + assert len(tokens) <= context_length + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def set_input_img_key(self, new_key: str): + self.input_keys['img'] = new_key + + def set_input_text_key(self, new_key: str): + self.input_keys['text'] = new_key + + def __call__(self, input: Union[str, tuple, Dict[str, Any]], *args, + **kwargs) -> Dict[str, Any]: + output = {} + # preprocess the image input + input_img_key = self.input_keys['img'] + if input_img_key in input and input[input_img_key] is not None: + image_input = input[input_img_key] + + # single image input + if isinstance(image_input, Image.Image): + image_tensor = self.img_preprocess(image_input).unsqueeze(0) + # multi images input + elif isinstance(image_input, list): + if all([isinstance(elem, Image.Image) + for elem in image_input]): + image_tensor = torch.stack( + [self.img_preprocess(elem) + for elem in image_input], # noqa + dim=0) # noqa + else: + unsupported_elem_type = [ + type(elem) for elem in image_input + if not isinstance(elem, Image.Image) + ][0] + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], \ + but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' + ) + output['img'] = image_tensor + + # preprocess the text input + input_text_key = self.input_keys['text'] + if input_text_key in input and input[input_text_key] is not None: + text_input = input[input_text_key] + + # single text input + if isinstance(text_input, str): + text_tensor = self.tokenize(text_input) + # multi texts input + elif isinstance(text_input, list): + if all([isinstance(elem, str) for elem in text_input]): + text_tensor = self.tokenize(text_input) + else: 
+ unsupported_elem_type = [ + type(elem) for elem in text_input + if not isinstance(elem, str) + ][0] + raise TypeError( + f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'text should be str or List[str], but got {type(text_input)}' + ) + output['text'] = text_tensor + + return output + + @PREPROCESSORS.register_module( Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor) class MPlugPreprocessor(Preprocessor): diff --git a/modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py b/modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py new file mode 100644 index 00000000..ce98e6c9 --- /dev/null +++ b/modelscope/trainers/hooks/clip_clamp_logit_scale_hook.py @@ -0,0 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch + +from modelscope.metainfo import Hooks +from modelscope.trainers.multi_modal.clip.clip_trainer import CLIPTrainer +from .builder import HOOKS +from .hook import Hook + + +@HOOKS.register_module(module_name=Hooks.ClipClampLogitScaleHook) +class ClipClampLogitScaleHook(Hook): + """ClipClampLogitScaleHook hook which performs clamp on CLIP logit scale parameter after update""" + + def after_train_iter(self, trainer: CLIPTrainer): + """Called after every training iter to evaluate the results.""" + unwrapped_model = getattr(trainer.model, 'module', trainer.model) + logit_scale = unwrapped_model.clip_model.logit_scale + logit_scale.data = torch.clamp(logit_scale.data, 0, 4.6052) diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer.py b/modelscope/trainers/multi_modal/clip/clip_trainer.py index cbe83417..40c524ac 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer.py @@ -1,169 +1,206 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
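Note: the clamp bound in the new hook is not arbitrary. The CLIP similarity logits are scaled by exp(logit_scale), and 4.6052 is approximately ln(100), so the clamp caps the temperature at 100 as in the original CLIP recipe. A standalone illustration (the parameter value is a toy example):

import math

import torch

logit_scale = torch.nn.Parameter(torch.tensor(5.3))  # drifted above the cap
logit_scale.data = torch.clamp(logit_scale.data, 0, 4.6052)
assert math.isclose(logit_scale.exp().item(), 100.0, rel_tol=1e-3)
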
+import math import os -from typing import Dict, Optional +from typing import Callable, Dict, Optional, Tuple, Union import torch -import torch.distributed as dist -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler +from torch import distributed as dist +from torch import nn +from torch.utils.data import Dataset from modelscope.metainfo import Trainers -from modelscope.models.base import Model -from modelscope.trainers.base import BaseTrainer +from modelscope.models.base import Model, TorchModel +from modelscope.models.multi_modal.clip.model import convert_models_to_fp32 +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.multi_modal import CLIPPreprocessor +from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config -from modelscope.utils.constant import ModeKeys -from modelscope.utils.logger import get_logger -from .clip_trainer_utils import ImageWithCaptionDataset, get_optimizer +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, + ModeKeys) +from .clip_trainer_utils import get_loss, get_optimizer_params, get_schedule -logger = get_logger() + +def exclude(n): + return 'bn' in n or 'ln' in n or 'bias' in n or 'logit_scale' in n + + +def include(n): + return not exclude(n) @TRAINERS.register_module(module_name=Trainers.clip_multi_modal_embedding) -class CLIPTrainer(BaseTrainer): - - def __init__(self, cfg_file: str, model: str, device_id: int, *args, - **kwargs): - super().__init__(cfg_file) - - self.cfg = Config.from_file(cfg_file) - self.model = Model.from_pretrained(model) - self.device_id = device_id - self.total_epoch = self.cfg.train.epoch - self.train_batch_size = self.cfg.train.batch_size - self.val_batch_size = self.cfg.evaluation.batch_size - self.ckpt_dir = self.cfg.train.ckpt_dir - - self.train_dataset = ImageWithCaptionDataset( - json_file='{}/{}'.format(self.cfg.dataset.root_dir, - self.cfg.dataset.train_set), - img_dir=self.cfg.dataset.root_dir, - phase=ModeKeys.TRAIN) - self.val_dataset = ImageWithCaptionDataset( - json_file='{}/{}'.format(self.cfg.dataset.root_dir, - self.cfg.dataset.val_set), - img_dir=self.cfg.dataset.root_dir, - phase=ModeKeys.EVAL) - - def train(self, *args, **kwargs): - assert dist.is_initialized() - - self.model.clip_model.train() - self.model.clip_model.to(self.device_id) - ddp_model = torch.nn.parallel.DistributedDataParallel( - self.model.clip_model, device_ids=[ - self.device_id, - ]) - - optimizer = get_optimizer(ddp_model) - - for epoch in range(self.total_epoch): - train_sampler = DistributedSampler( - dataset=self.train_dataset, shuffle=True) - train_sampler.set_epoch(epoch) - - train_params = { - 'pin_memory': True, - 'collate_fn': None, - 'batch_size': self.train_batch_size, - 'shuffle': False, - 'drop_last': True, - 'sampler': train_sampler, - 'num_workers': 8 +class CLIPTrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, 
+ optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, + **kwargs): + model = Model.from_pretrained(model, revision=model_revision) + # for training & eval, we convert the model from FP16 back to FP32 + # to compatible with modelscope amp training + convert_models_to_fp32(model) + cfg = Config.from_file(cfg_file) + if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: + work_dir = cfg.train.work_dir + else: + work_dir = kwargs['work_dir'] + + # fetch the model name of CLIP model (base, large or large-336) + model_name = cfg.pretrained_model.model_name + + # world size + world_size = int(os.environ.get('WORLD_SIZE', 1)) + + # train step, optimizer and lr_scheduler + epoch_steps = math.ceil( + len(train_dataset) / # noqa + (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa + cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs + + if optimizers[0] is None: + named_parameters = list(model.named_parameters()) + gain_or_bias_params = [ + p for n, p in named_parameters + if exclude(n) and p.requires_grad + ] + rest_params = [ + p for n, p in named_parameters + if include(n) and p.requires_grad + ] + optimizer_hparams = get_optimizer_params( + model_name, cfg) # lr, wd, beta1, beta2, eps + optimizer_args = { + 'params': [ + { + 'params': gain_or_bias_params, + 'weight_decay': 0. + }, + { + 'params': rest_params, + 'weight_decay': optimizer_hparams['weight_decay'] + }, + ], + 'lr': + optimizer_hparams['lr'], + 'betas': + (optimizer_hparams['beta1'], optimizer_hparams['beta2']), + 'eps': + optimizer_hparams['eps'], + } + optimizer = build_optimizer( + model, cfg=cfg.train.optimizer, default_args=optimizer_args) + else: + optimizer = optimizers[0] + + if optimizers[1] is None: + lr_scheduler = get_schedule(optimizer, cfg.train.lr_scheduler) + else: + lr_scheduler = optimizers[1] + optimizers = (optimizer, lr_scheduler) + + # loss module + loss_img = nn.CrossEntropyLoss() + loss_txt = nn.CrossEntropyLoss() + self.loss_img = loss_img.cuda(int(os.environ.get('LOCAL_RANK', 0))) + self.loss_txt = loss_txt.cuda(int(os.environ.get('LOCAL_RANK', 0))) + self.loss_cfg = cfg.train.loss_cfg + + # launcher and use_fp16 + if 'launcher' not in kwargs and cfg.train.get('launcher', None): + kwargs['launcher'] = cfg.train.launcher + if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): + kwargs['use_fp16'] = cfg.train.use_fp16 + + # preprocessor + if preprocessor is None: + preprocessor = { + ConfigKeys.train: + CLIPPreprocessor( + model_dir=work_dir, + mode=ModeKeys.TRAIN, + tokenizer=model.tokenizer, + resolution=model.model_info['image_resolution']), + ConfigKeys.val: + CLIPPreprocessor( + model_dir=work_dir, + mode=ModeKeys.EVAL, + tokenizer=model.tokenizer, + resolution=model.model_info['image_resolution']), } - train_loader = DataLoader(self.train_dataset, **train_params) - - for batch_idx, (img_tensor, text_str_list, - img_id_list) in enumerate(train_loader): - text_info_list = [ - self.model.tokenize_text(tmp) for tmp in text_str_list - ] - text_ids_tensor = torch.cat([tmp[0] for tmp in text_info_list], - dim=0) - text_masks_tensor = torch.cat( - [tmp[1] for tmp in text_info_list], dim=0) - - img_tensor = img_tensor.to(self.device_id, non_blocking=True) - img_id_list = img_id_list.to(self.device_id, non_blocking=True) - text_ids_tensor = text_ids_tensor.to( - self.device_id, non_blocking=True) - text_masks_tensor = 
text_masks_tensor.to( - self.device_id, non_blocking=True) - - loss = ddp_model((img_tensor, text_ids_tensor, - text_masks_tensor, img_id_list), - ModeKeys.TRAIN) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if batch_idx % 10 == 0: - logger.info( - 'epoch: {}, train batch {}/{}, loss={:.5f}, logit_scale={:.5f}' - .format(epoch, batch_idx, len(train_loader), - loss.item(), - ddp_model.module.logit_scale.exp().item())) - if dist.get_rank() == 0: - os.makedirs(self.ckpt_dir, exist_ok=True) - torch.save(ddp_model.module.state_dict(), - '{}/epoch{}.pth'.format(self.ckpt_dir, epoch)) - - def evaluate(self, - checkpoint_path: Optional[str] = None, - *args, - **kwargs) -> Dict[str, float]: - if checkpoint_path is not None: - checkpoint_params = torch.load(checkpoint_path, 'cpu') - self.model.clip_model.load_state_dict(checkpoint_params) - self.model.clip_model.eval() - self.model.clip_model.to(self.device_id) - - val_params = { - 'collate_fn': None, - 'batch_size': self.val_batch_size, - 'shuffle': False, - 'drop_last': False, - 'num_workers': 8 - } - val_loader = DataLoader(self.val_dataset, **val_params) - - tp_cnt_per_batch = [] - processed_cnt = 0 - with torch.no_grad(): - for batch_idx, (img_tensor, text_str_list, - img_id_list) in enumerate(val_loader): - text_info_list = [ - self.model.tokenize_text(tmp) for tmp in text_str_list - ] - text_ids_tensor = torch.cat([tmp[0] for tmp in text_info_list], - dim=0) - text_masks_tensor = torch.cat( - [tmp[1] for tmp in text_info_list], dim=0) - - img_tensor = img_tensor.to(self.device_id, non_blocking=True) - img_id_list = img_id_list.to(self.device_id, non_blocking=True) - text_ids_tensor = text_ids_tensor.to( - self.device_id, non_blocking=True) - text_masks_tensor = text_masks_tensor.to( - self.device_id, non_blocking=True) - - img_feat = self.model.clip_model(img_tensor, input_type='img') - text_feat = self.model.clip_model( - (text_ids_tensor, text_masks_tensor), input_type='text') - - sim_mat = text_feat @ img_feat.t() - text_cnt, img_cnt = sim_mat.shape - top1_scores, match_ids = torch.max(sim_mat, dim=1) - - match_ids = match_ids.int() - gt_ids = torch.tensor(range(0, text_cnt)).to( - self.device_id, non_blocking=True).int() - error_cnt = torch.nonzero(match_ids - gt_ids) - processed_cnt += text_cnt - - tp_cnt_per_batch.append(text_cnt - 1.0 * error_cnt.numel()) - logger.info('current acc: {:.3f}'.format( - sum(tp_cnt_per_batch) / processed_cnt)) + # dataset related + self.dataset_cfg = cfg.dataset + if hasattr(self.dataset_cfg, 'column_map'): + # cases where dataset key names are not "img" and "text" + img_key_name = getattr(self.dataset_cfg.column_map, 'img', 'img') + preprocessor[ConfigKeys.train].set_input_img_key(img_key_name) + preprocessor[ConfigKeys.val].set_input_img_key(img_key_name) + text_key_name = getattr(self.dataset_cfg.column_map, 'text', + 'text') + preprocessor[ConfigKeys.train].set_input_text_key(text_key_name) + preprocessor[ConfigKeys.val].set_input_text_key(text_key_name) + self.global_batch_size = cfg.train.dataloader.batch_size_per_gpu * world_size + + super().__init__( + model=model, + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + preprocessor=preprocessor, + optimizers=optimizers, + seed=seed, + **kwargs, + ) + + def train_step(self, model, inputs): + model.train() + inputs['mode'] = ModeKeys.TRAIN + model_outputs = model.forward( + inputs + ) # {OutputKeys.IMG_EMBEDDING: Tensor(batch_size, dim), 
OutputKeys.TEXT_EMBEDDING: Tensor(batch_size, dim)} + loss = get_loss(model_outputs, self.loss_img, self.loss_txt, + self.loss_cfg) + train_outputs = {'loss': loss} + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + unwrapped_model = getattr(model, 'module', model) + log_vars[ + 'logit_scale'] = unwrapped_model.clip_model.logit_scale.data.clone( + ).item() # noqa + log_vars['global_batch_size'] = int(self.global_batch_size) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + self.train_outputs = train_outputs diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py b/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py index 4e150fe7..fed255de 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py @@ -1,94 +1,125 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2022 The OFA-Sys Team. +# All rights reserved. +# This source code is licensed under the Apache 2.0 license +# found in the LICENSE file in the root directory. +import math import os -import random +from functools import partial +from inspect import unwrap -import json import torch -import torch.nn.functional as F -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms - -from modelscope.utils.constant import ModeKeys - -train_transform = transforms.Compose([ - transforms.RandomResizedCrop( - 224, scale=(0.5, 1.0), interpolation=Image.BICUBIC), - transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], - p=0.8), - transforms.RandomGrayscale(p=0.2), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)) -]) - -val_transform = transforms.Compose([ - transforms.Resize((224, 224), interpolation=Image.BICUBIC), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)) -]) - - -class ImageWithCaptionDataset(Dataset): - - def __init__(self, json_file, img_dir, phase): - self.annotations = json.load(open(json_file)) - self.img_dir = img_dir - if phase == ModeKeys.TRAIN: - self.transform = train_transform - elif phase == ModeKeys.EVAL: - self.transform = val_transform - - self.img_name2img_id = {} - for anno_dict in self.annotations: - img_name = anno_dict['image'] - if img_name not in self.img_name2img_id: - self.img_name2img_id[img_name] = len(self.img_name2img_id) - - def __len__(self): - return len(self.annotations) - - def __getitem__(self, index): - anno_dict = self.annotations[index] - - img_path = os.path.join(self.img_dir, anno_dict['image']) - img_pil = Image.open(img_path).convert('RGB') - img_th = self.transform(img_pil) - img_id = self.img_name2img_id[anno_dict['image']] - - text_str = random.choice(anno_dict['caption']) - - return img_th, text_str, img_id - - -def get_params_groups(ddp_model, weight_decay): - decay = [] - no_decay = [] - for name, param in 
ddp_model.named_parameters(): - if not param.requires_grad: - continue - if len(param.shape) == 1 or name.endswith('.bias'): - no_decay.append(param) - else: - decay.append(param) - params_groups = [{ - 'params': no_decay, - 'weight_decay': 0. - }, { - 'params': decay, - 'weight_decay': weight_decay - }] - return params_groups - - -def get_optimizer(ddp_model): - from torch.optim import AdamW - lr_init = 1e-5 - betas = [0.9, 0.999] - weight_decay = 0.02 - params_groups = get_params_groups(ddp_model, weight_decay=weight_decay) - return AdamW( - params_groups, lr=lr_init, betas=betas, weight_decay=weight_decay) +import torch.distributed as dist +from torch.optim.lr_scheduler import LambdaLR + +from modelscope.outputs import OutputKeys + + +def get_optimizer_params(model_name, cfg): + # get default params + # Params from paper (https://arxiv.org/pdf/2103.00020.pdf) + # base model + if model_name in ['damo/multi-modal_clip-vit-base-patch16_zh']: + params = { + 'lr': 5.0e-4, + 'beta1': 0.9, + 'beta2': 0.98, + 'eps': 1.0e-6, + 'weight_decay': 0.0 + } + # large models + elif model_name in [ + 'damo/multi-modal_clip-vit-large-patch14_zh', + 'damo/multi-modal_clip-vit-large-patch14_336_zh' + ]: + params = { + 'lr': 4.0e-4, + 'beta1': 0.9, + 'beta2': 0.98, + 'eps': 1.0e-6, + 'weight_decay': 0.0 + } + else: + params = { + 'lr': 5.0e-4, + 'beta1': 0.9, + 'beta2': 0.999, + 'eps': 1.0e-8, + 'weight_decay': 0.0 + } + # override with config params + for key in ['lr', 'beta1', 'beta2', 'eps', 'weight_decay']: + if hasattr(cfg.train, 'optimizer_hparams'): + params[key] = getattr(cfg.train.optimizer_hparams, key, + params[key]) + return params + + +def get_loss(model_outputs, loss_img, loss_txt, loss_cfg): + image_features = model_outputs[OutputKeys.IMG_EMBEDDING] + text_features = model_outputs[OutputKeys.TEXT_EMBEDDING] + logit_scale = model_outputs['logit_scale'] + logit_scale = logit_scale.mean() + if loss_cfg.aggregate and int(os.environ.get('WORLD_SIZE', 1)) > 1: + world_size = dist.get_world_size() + rank = dist.get_rank() + + # We gather tensors from all gpus to get more negatives to contrast with. + gathered_image_features = [ + torch.zeros_like(image_features) for _ in range(world_size) + ] + gathered_text_features = [ + torch.zeros_like(text_features) for _ in range(world_size) + ] + dist.all_gather(gathered_image_features, image_features) + dist.all_gather(gathered_text_features, text_features) + + all_image_features = torch.cat([image_features] + + gathered_image_features[:rank] + + gathered_image_features[rank + 1:]) + all_text_features = torch.cat([text_features] + + gathered_text_features[:rank] + + gathered_text_features[rank + 1:]) + + # this is needed to send gradients back everywhere. 
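+        # dist.all_gather returns tensors detached from the autograd graph, so the
+        # rank-local image/text features (which do carry gradients) are kept in the
+        # concatenation; each rank then back-propagates only through its own samples.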
+ logits_per_image = logit_scale * all_image_features @ all_text_features.t( + ) + logits_per_text = logits_per_image.t() + + else: + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + ground_truth = torch.arange(len(logits_per_image)).long() + ground_truth = ground_truth.cuda( + int(os.environ.get('LOCAL_RANK', 0)), non_blocking=True) + + total_loss = (loss_img(logits_per_image, ground_truth) + + loss_txt(logits_per_text, ground_truth)) / 2 + + return total_loss + + +def lr_lambda(num_warmup_steps, num_training_steps, num_cycles, current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float( + max(1, num_training_steps - num_warmup_steps)) + return max( + 0.0, + 0.5 * # noqa + (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) # noqa + + +def get_schedule(optimizer, + scheduler, + num_cycles: float = 0.5, + last_epoch: int = -1): + num_warmup_steps = int(scheduler.warmup_proportion + * scheduler.num_train_steps) + num_training_steps = scheduler.num_train_steps + + return LambdaLR( + optimizer, + partial(lr_lambda, num_warmup_steps, num_training_steps, num_cycles), + last_epoch) diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index ee9cdb1f..7eddc690 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -24,7 +24,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): def test_run(self): pipeline_multi_modal_embedding = pipeline( Tasks.multi_modal_embedding, model=self.model_id) - text_embedding = pipeline_multi_modal_embedding( + text_embedding = pipeline_multi_modal_embedding.forward( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( torch.norm(text_embedding, p=1, dim=-1).item())) @@ -36,7 +36,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): model = Model.from_pretrained(self.model_id) pipeline_multi_modal_embedding = pipeline( task=Tasks.multi_modal_embedding, model=model) - text_embedding = pipeline_multi_modal_embedding( + text_embedding = pipeline_multi_modal_embedding.forward( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( torch.norm(text_embedding, p=1, dim=-1).item())) @@ -47,7 +47,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_default_model(self): pipeline_multi_modal_embedding = pipeline( task=Tasks.multi_modal_embedding) - text_embedding = pipeline_multi_modal_embedding( + text_embedding = pipeline_multi_modal_embedding.forward( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( torch.norm(text_embedding, p=1, dim=-1).item())) diff --git a/tests/trainers/test_clip_trainer.py b/tests/trainers/test_clip_trainer.py new file mode 100644 index 00000000..e460f1ac --- /dev/null +++ b/tests/trainers/test_clip_trainer.py @@ -0,0 +1,83 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
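Note: get_schedule above wires lr_lambda into a LambdaLR with warmup_proportion * num_train_steps warmup steps followed by cosine decay. A quick standalone check of the resulting shape, assuming a toy optimizer and step counts (values are illustrative only):

import math
from functools import partial

import torch
from torch.optim.lr_scheduler import LambdaLR


def lr_lambda(num_warmup_steps, num_training_steps, num_cycles, current_step):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    progress = float(current_step - num_warmup_steps) / float(
        max(1, num_training_steps - num_warmup_steps))
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))


param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=5e-4)
scheduler = LambdaLR(optimizer, partial(lr_lambda, 10, 100, 0.5))
for step in range(100):
    optimizer.step()
    scheduler.step()
# lr ramps linearly to 5e-4 over the first 10 steps, then decays along a half cosine
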
+import os +import shutil +import unittest + +import json + +from modelscope.metainfo import Metrics, Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + + +class TestClipTrainer(unittest.TestCase): + + def setUp(self) -> None: + self.finetune_cfg = \ + {'framework': 'pytorch', + 'task': 'multi-modal-embedding', + 'pipeline': {'type': 'multi-modal-embedding'}, + 'pretrained_model': {'model_name': 'damo/multi-modal_clip-vit-base-patch16_zh'}, + 'dataset': {'column_map': {'img': 'image', 'text': 'query'}}, + 'train': {'work_dir': './workspace/ckpts/clip', + # 'launcher': 'pytorch', + 'max_epochs': 1, + 'use_fp16': True, + 'dataloader': {'batch_size_per_gpu': 8, + 'workers_per_gpu': 0, + 'shuffle': True, + 'drop_last': True}, + 'lr_scheduler': {'name': 'cosine', + 'warmup_proportion': 0.01}, + 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, + 'optimizer': {'type': 'AdamW'}, + 'optimizer_hparams': {'lr': 5e-05, 'weight_decay': 0.01}, + 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', + 'cumulative_iters': 1, + 'loss_keys': 'loss'}, + 'loss_cfg': {'aggregate': True}, + 'hooks': [{'type': 'BestCkptSaverHook', + 'metric_key': 'inbatch_t2i_recall_at_1', + 'interval': 100}, + {'type': 'TextLoggerHook', 'interval': 1}, + {'type': 'IterTimerHook'}, + {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}, + {'type': 'ClipClampLogitScaleHook'}]}, + 'evaluation': {'dataloader': {'batch_size_per_gpu': 8, + 'workers_per_gpu': 0, + 'shuffle': True, + 'drop_last': True}, + 'metrics': [{'type': 'inbatch_recall'}]}, + 'preprocessor': []} + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_std(self): + WORKSPACE = './workspace/ckpts/clip' + os.makedirs(WORKSPACE, exist_ok=True) + config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) + with open(config_file, 'w') as writer: + json.dump(self.finetune_cfg, writer) + + pretrained_model = 'damo/multi-modal_clip-vit-base-patch16_zh' + args = dict( + model=pretrained_model, + work_dir=WORKSPACE, + train_dataset=MsDataset.load( + 'muge', namespace='modelscope', split='train[:200]'), + eval_dataset=MsDataset.load( + 'muge', namespace='modelscope', split='validation[:100]'), + metrics=[Metrics.inbatch_recall], + cfg_file=config_file) + trainer = build_trainer( + name=Trainers.clip_multi_modal_embedding, default_args=args) + trainer.train() + + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, + os.listdir(os.path.join(WORKSPACE, 'output'))) + shutil.rmtree(WORKSPACE) + + +if __name__ == '__main__': + unittest.main() From ce08cfbea862fe097c07d9646ba3bf380eef4467 Mon Sep 17 00:00:00 2001 From: "yuanzheng.yuanzhen" Date: Mon, 31 Oct 2022 18:47:06 +0800 Subject: [PATCH 12/46] [to #42322933]Add licenses Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10580553 * Add licenses --- modelscope/models/science/unifold/dataset.py | 3 +++ modelscope/models/science/unifold/model.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/modelscope/models/science/unifold/dataset.py b/modelscope/models/science/unifold/dataset.py index 05803f2c..29e1a8b0 100644 --- a/modelscope/models/science/unifold/dataset.py +++ b/modelscope/models/science/unifold/dataset.py @@ -1,3 +1,6 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
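Note on the CLIP trainer test above: the BestCkptSaverHook's metric_key ('inbatch_t2i_recall_at_1') must match the key returned by the metric's evaluate(). A minimal consistency check against the metric added in this series (the identity-matrix batch is a contrived perfect-retrieval case; treat this as a sketch, not part of the test suite):

import torch

from modelscope.metrics.inbatch_recall_metric import InbatchRecallMetric
from modelscope.outputs import OutputKeys

metric = InbatchRecallMetric()
feats = torch.eye(4)  # image i matches text i exactly
metric.add({OutputKeys.IMG_EMBEDDING: feats, OutputKeys.TEXT_EMBEDDING: feats}, {})
print(metric.evaluate())  # expected: {'inbatch_t2i_recall_at_1': 1.0}
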
+ import copy import logging import os diff --git a/modelscope/models/science/unifold/model.py b/modelscope/models/science/unifold/model.py index 6632751a..7f28f18d 100644 --- a/modelscope/models/science/unifold/model.py +++ b/modelscope/models/science/unifold/model.py @@ -1,3 +1,6 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + import argparse import os from typing import Any From 64868bf2ad65308be1372e2c88f0133daf39d6a9 Mon Sep 17 00:00:00 2001 From: "xiaodongdeng.dxd" Date: Mon, 31 Oct 2022 20:42:56 +0800 Subject: [PATCH 13/46] =?UTF-8?q?[to=20#42322933]=E5=A4=9A=E6=A8=A1?= =?UTF-8?q?=E6=80=81=E9=A2=84=E8=AE=AD=E7=BB=83=E6=A8=A1=E5=9E=8BOFA?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=94=AF=E6=8C=816b=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E7=9A=84feature?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 多模态预训练模型OFA增加支持6b模型的feature Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10574571 --- .../multi_modal/ofa/configuration_ofa.py | 13 + .../models/multi_modal/ofa/modeling_ofa.py | 344 +++++++++++------- .../models/multi_modal/ofa/utils/utils.py | 40 ++ modelscope/models/multi_modal/ofa/vit.py | 155 ++++++++ .../models/multi_modal/ofa_for_all_tasks.py | 7 +- 5 files changed, 416 insertions(+), 143 deletions(-) mode change 100755 => 100644 modelscope/models/multi_modal/ofa/modeling_ofa.py create mode 100644 modelscope/models/multi_modal/ofa/vit.py diff --git a/modelscope/models/multi_modal/ofa/configuration_ofa.py b/modelscope/models/multi_modal/ofa/configuration_ofa.py index 4899f416..2edc651e 100644 --- a/modelscope/models/multi_modal/ofa/configuration_ofa.py +++ b/modelscope/models/multi_modal/ofa/configuration_ofa.py @@ -136,6 +136,12 @@ class OFAConfig(PretrainedConfig): entangle_position_embedding=False, interpolate_position=False, orig_patch_image_size=224, + share_attn_bias=False, + use_image_feature=True, + disable_entangle=False, + use_ofasys=False, + vit_type='vit_base', + vit_drop_path_rate=0.0, **kwargs): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -178,6 +184,13 @@ class OFAConfig(PretrainedConfig): self.interpolate_position = interpolate_position self.orig_patch_image_size = orig_patch_image_size + self.share_attn_bias = share_attn_bias + self.use_image_feature = use_image_feature + self.disable_entangle = disable_entangle + self.use_ofasys = use_ofasys + self.vit_type = vit_type + self.vit_drop_path_rate = vit_drop_path_rate + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py old mode 100755 new mode 100644 index 0a7a2ce6..69005ef0 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -35,6 +35,8 @@ from transformers.utils import logging from .configuration_ofa import OFAConfig from .generate import utils from .resnet import ResNet +from .utils.utils import DropPath +from .vit import vit_base, vit_huge, vit_large, vit_large_336 logger = logging.get_logger(__name__) @@ -249,45 +251,6 @@ class LayerDropModuleList(nn.ModuleList): yield m -def drop_path(x, drop_prob: float = 0.0, training: bool = False): - r""" - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - Args: - x (`nn.Modules`): input nn layers. 
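# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a minimal sketch of how the new
# OFAConfig switches introduced here might be combined for an OFASys-style
# checkpoint with a ViT visual backbone. The particular values are assumptions
# for demonstration, not a released model configuration.
from modelscope.models.multi_modal.ofa.configuration_ofa import OFAConfig

config = OFAConfig(
    share_attn_bias=True,      # share one relative-position bias table across layers
    use_image_feature=True,    # keep the visual branch enabled
    use_ofasys=True,           # take the OFASys code path added in this patch
    vit_type='vit_huge',       # one of vit_base / vit_large / vit_large_336 / vit_huge
    vit_drop_path_rate=0.1,    # stochastic-depth rate for the ViT backbone
)
print(config.vit_type, config.share_attn_bias, config.use_ofasys)
# ---------------------------------------------------------------------------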
- drop_prob (`float`): drop path ratio. - training (`bool`): whether is training or inference. - """ - if drop_prob == 0.0 or not training: - return x - keep_prob = 1 - drop_prob - shape = (1, x.shape[1], 1) - random_tensor = keep_prob + torch.rand( - shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(keep_prob) * random_tensor - return output - - -class DropPath(nn.Module): - r""" - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - Args: - drop_prob: drop path ratio. - """ - - def __init__(self, drop_prob=None): - super().__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return 'p={}'.format(self.drop_prob) - - class OFAAttention(nn.Module): r""" Multi-headed attention, with additional implementation for NormFormer. @@ -898,31 +861,49 @@ class OFAEncoder(OFAPreTrainedModel): self.padding_idx) if config.add_type_embedding: - self.type_embedding = Embedding(2, embed_dim, padding_idx=None) + if config.use_image_feature: + self.type_embedding = Embedding(2, embed_dim, padding_idx=None) + else: + self.type_embedding = Embedding(1, embed_dim, padding_idx=None) else: self.type_embedding = None - if config.resnet_type == 'resnet18': - self.embed_images = ResNet( - [2, 2, 2], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet34': - self.embed_images = ResNet( - [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet50': - self.embed_images = ResNet( - [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet101': - self.embed_images = ResNet( - [3, 4, 23], drop_path_rate=config.resnet_drop_path_rate) - elif config.resnet_type == 'resnet152': - self.embed_images = ResNet( - [3, 8, 36], drop_path_rate=config.resnet_drop_path_rate) - else: - raise NotImplementedError + if config.use_image_feature: + if config.use_ofasys: + vit_backbone = { + 'vit_base': vit_base, + 'vit_large': vit_large, + 'vit_large_336': vit_large_336, + 'vit_huge': vit_huge, + }[config.vit_type] + self.embed_images = vit_backbone(config.vit_drop_path_rate) - self.image_proj = Linear(1024, embed_dim) + self.image_proj = Linear(self.embed_images.width, embed_dim) - if config.resnet_model_path: + else: + if config.resnet_type == 'resnet18': + self.embed_images = ResNet( + [2, 2, 2], drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet34': + self.embed_images = ResNet( + [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet50': + self.embed_images = ResNet( + [3, 4, 6], drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet101': + self.embed_images = ResNet( + [3, 4, 23], + drop_path_rate=config.resnet_drop_path_rate) + elif config.resnet_type == 'resnet152': + self.embed_images = ResNet( + [3, 8, 36], + drop_path_rate=config.resnet_drop_path_rate) + else: + raise NotImplementedError + + self.image_proj = Linear(1024, embed_dim) + + if not config.use_ofasys and config.resnet_model_path: print('load resnet {}'.format(config.resnet_model_path)) resnet_state_dict = torch.load(config.resnet_model_path) self.embed_images.load_state_dict(resnet_state_dict) @@ -933,14 +914,21 @@ class OFAEncoder(OFAPreTrainedModel): self.embed_positions = Embedding(self.max_source_positions + 2, embed_dim) - self.embed_image_positions = Embedding(config.image_bucket_size**2 + 1, 
- embed_dim) - self.pos_ln = LayerNorm(embed_dim) - self.image_pos_ln = LayerNorm(embed_dim) + + if config.use_image_feature: + self.embed_image_positions = Embedding( + config.image_bucket_size**2 + 1, embed_dim) + if not config.use_ofasys: + self.pos_ln = LayerNorm(embed_dim) + + if config.use_image_feature: + self.image_pos_ln = LayerNorm(embed_dim) self.pos_scaling = float(embed_dim / self.num_attention_heads * config.attn_scale_factor)**-0.5 - self.pos_q_linear = nn.Linear(embed_dim, embed_dim) - self.pos_k_linear = nn.Linear(embed_dim, embed_dim) + + if not (config.use_ofasys and config.entangle_position_embedding): + self.pos_q_linear = nn.Linear(embed_dim, embed_dim) + self.pos_k_linear = nn.Linear(embed_dim, embed_dim) if self.encoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.encoder_layerdrop) @@ -965,22 +953,28 @@ class OFAEncoder(OFAPreTrainedModel): self.token_bucket_size = config.token_bucket_size token_num_rel_dis = 2 * config.token_bucket_size - 1 token_rp_bucket = make_token_bucket_position(config.token_bucket_size) + self.share_attn_bias = config.share_attn_bias + num_rel_pos_tables = 1 if config.share_attn_bias else config.encoder_layers self.token_rel_pos_table_list = nn.ModuleList([ Embedding( token_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.encoder_layers) + for _ in range(num_rel_pos_tables) ]) - self.image_bucket_size = config.image_bucket_size - image_num_rel_dis = (2 * config.image_bucket_size - - 1) * (2 * config.image_bucket_size - 1) + 3 - image_rp_bucket = make_image_bucket_position(config.image_bucket_size, - image_num_rel_dis) - self.image_rel_pos_table_list = nn.ModuleList([ - Embedding( - image_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.encoder_layers) - ]) + if config.use_image_feature: + self.image_bucket_size = config.image_bucket_size + image_num_rel_dis = (2 * config.image_bucket_size + - 1) * (2 * config.image_bucket_size - 1) + 3 + image_rp_bucket = make_image_bucket_position( + config.image_bucket_size, image_num_rel_dis) + self.image_rel_pos_table_list = nn.ModuleList([ + Embedding( + image_num_rel_dis, + self.num_attention_heads, + zero_init=True) for _ in range(num_rel_pos_tables) + ]) + + self.register_buffer('image_rp_bucket', image_rp_bucket) if config.layernorm_embedding: self.layernorm_embedding = LayerNorm(embed_dim) @@ -988,12 +982,12 @@ class OFAEncoder(OFAPreTrainedModel): self.layernorm_embedding = None self.register_buffer('token_rp_bucket', token_rp_bucket) - self.register_buffer('image_rp_bucket', image_rp_bucket) self.entangle_position_embedding = config.entangle_position_embedding self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() + self.use_ofasys = config.use_ofasys def get_input_embeddings(self): r""" @@ -1305,21 +1299,41 @@ class OFAEncoder(OFAPreTrainedModel): if has_pads: x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x)) - pos_embed = self.pos_ln(pos_embed) - if patch_images is not None: - image_pos_embed = self.image_pos_ln(image_pos_embed) - pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1) - if patch_images_2 is not None: - image_pos_embed_2 = self.image_pos_ln(image_pos_embed_2) - pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1) + if self.use_ofasys: + if patch_images is not None: + pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1) + if patch_images_2 is not None: + pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1) + else: + pos_embed 
= self.pos_ln(pos_embed) + if patch_images is not None: + image_pos_embed = self.image_pos_ln(image_pos_embed) + pos_embed = torch.cat([image_pos_embed, pos_embed], dim=1) + if patch_images_2 is not None: + image_pos_embed_2 = self.image_pos_ln(image_pos_embed_2) + pos_embed = torch.cat([image_pos_embed_2, pos_embed], dim=1) + + def build_abs_pos_bias(pos_embed): + batch_size, seq_length = pos_embed.size(0), pos_embed.size(1) + if not (self.use_ofasys and self.entangle_position_embedding): + pos_q = self.pos_q_linear(pos_embed).view( + batch_size, seq_length, self.num_attention_heads, + -1).transpose(1, 2) * self.pos_scaling + pos_k = self.pos_k_linear(pos_embed).view( + batch_size, seq_length, self.num_attention_heads, + -1).transpose(1, 2) + abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + else: + abs_pos_bias = torch.zeros( + batch_size, + self.num_attention_heads, + seq_length, + seq_length, + dtype=pos_embed.dtype, + device=pos_embed.device) + return abs_pos_bias - pos_q = self.pos_q_linear(pos_embed).view( - x.size(0), x.size(1), self.num_attention_heads, -1).transpose( - 1, 2) * self.pos_scaling - pos_k = self.pos_k_linear(pos_embed).view( - x.size(0), x.size(1), self.num_attention_heads, - -1).transpose(1, 2) - abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + abs_pos_bias = build_abs_pos_bias(pos_embed) # expand attention_mask if has_pads: @@ -1334,19 +1348,22 @@ class OFAEncoder(OFAPreTrainedModel): if output_hidden_states: encoder_states += (x, ) self_attn_bias = abs_pos_bias.clone() + + real_idx = 0 if self.share_attn_bias else idx + self_attn_bias[:, :, -input_ids.size(1):, -input_ids.size(1):] += self.get_rel_pos_bias( - input_ids, idx) + input_ids, real_idx) if patch_images_2 is not None: self_attn_bias[:, :, :image_num_patches_2, :image_num_patches_2] += \ - self.get_image_rel_pos_bias(image_position_ids_2, idx) + self.get_image_rel_pos_bias(image_position_ids_2, real_idx) self_attn_bias[:, :, image_num_patches_2:image_num_patches_2 + image_num_patches, # noqa image_num_patches_2:image_num_patches_2 + image_num_patches] += \ - self.get_image_rel_pos_bias(image_position_ids, idx) # noqa + self.get_image_rel_pos_bias(image_position_ids, real_idx) # noqa elif patch_images is not None: self_attn_bias[:, :, :x.size(1) - input_ids.size(1), :x.size(1) - input_ids.size(1)] += \ - self.get_image_rel_pos_bias(image_position_ids, idx) + self.get_image_rel_pos_bias(image_position_ids, real_idx) self_attn_bias = self_attn_bias.reshape(-1, x.size(1), x.size(1)) hidden_outputs = layer( @@ -1398,6 +1415,8 @@ class OFADecoder(OFAPreTrainedModel): self._future_mask = torch.empty(0) self.share_input_output_embed = config.share_decoder_input_output_embed self.num_attention_heads = config.decoder_attention_heads + self.use_ofasys = config.use_ofasys + self.disable_entangle = config.disable_entangle if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -1415,18 +1434,31 @@ class OFADecoder(OFAPreTrainedModel): else: self.layernorm_embedding = None + if config.use_ofasys: + if config.add_type_embedding: + self.type_embedding = Embedding( + 1, self.embed_dim, padding_idx=None) + else: + self.type_embedding = None + self.window_size = config.code_image_size // 8 self.embed_positions = Embedding(self.max_target_positions + 2, self.embed_dim) - self.embed_image_positions = Embedding(config.image_bucket_size**2 + 1, - self.embed_dim) - self.pos_ln = LayerNorm(self.embed_dim) - self.image_pos_ln = LayerNorm(self.embed_dim) + + if not config.use_ofasys: + 
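# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a shape sketch of the absolute
# position bias built by build_abs_pos_bias. With positional q/k projections it
# is a [batch, heads, seq, seq] attention bias; under use_ofasys with entangled
# position embeddings it degenerates to zeros of the same shape. Sizes below
# are arbitrary.
import torch

batch, seq, heads, head_dim = 2, 16, 8, 32
pos_embed = torch.randn(batch, seq, heads * head_dim)
pos_q = pos_embed.view(batch, seq, heads, head_dim).transpose(1, 2)
pos_k = pos_embed.view(batch, seq, heads, head_dim).transpose(1, 2)
abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3))
assert abs_pos_bias.shape == (batch, heads, seq, seq)
# ---------------------------------------------------------------------------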
self.embed_image_positions = Embedding( + config.image_bucket_size**2 + 1, self.embed_dim) + if not config.use_ofasys: + self.pos_ln = LayerNorm(self.embed_dim) + self.image_pos_ln = LayerNorm(self.embed_dim) self.pos_scaling = float(self.embed_dim / self.num_attention_heads * config.attn_scale_factor)**-0.5 - self.self_pos_q_linear = nn.Linear(self.embed_dim, self.embed_dim) - self.self_pos_k_linear = nn.Linear(self.embed_dim, self.embed_dim) + + if not (config.use_ofasys and config.entangle_position_embedding): + self.self_pos_q_linear = nn.Linear(self.embed_dim, self.embed_dim) + self.self_pos_k_linear = nn.Linear(self.embed_dim, self.embed_dim) + self.cross_pos_q_linear = nn.Linear(self.embed_dim, self.embed_dim) self.cross_pos_k_linear = nn.Linear(self.embed_dim, self.embed_dim) @@ -1463,33 +1495,41 @@ class OFADecoder(OFAPreTrainedModel): self.token_bucket_size = config.token_bucket_size token_num_rel_dis = 2 * config.token_bucket_size - 1 token_rp_bucket = make_token_bucket_position(config.token_bucket_size) + + self.share_attn_bias = config.share_attn_bias + num_rel_pos_tables = 1 if config.share_attn_bias else config.decoder_layers self.token_rel_pos_table_list = nn.ModuleList([ Embedding( token_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.decoder_layers) + for _ in range(num_rel_pos_tables) ]) - self.image_bucket_size = config.image_bucket_size - image_num_rel_dis = (2 * config.image_bucket_size - - 1) * (2 * config.image_bucket_size - 1) + 3 - image_rp_bucket = make_image_bucket_position(config.image_bucket_size, - image_num_rel_dis) - image_position_idx = torch.arange(self.window_size).unsqueeze(0).expand(self.window_size, self.window_size) + \ - torch.arange(self.window_size).unsqueeze(1) * config.image_bucket_size + 1 # noqa - image_position_idx = torch.cat( - [torch.tensor([0]), image_position_idx.view(-1)]) - image_position_idx = torch.cat( - [image_position_idx, - torch.tensor([1024] * 768)]) - self.image_rel_pos_table_list = nn.ModuleList([ - Embedding( - image_num_rel_dis, self.num_attention_heads, zero_init=True) - for _ in range(config.decoder_layers) - ]) + if config.use_image_feature: + if not config.use_ofasys: + self.image_bucket_size = config.image_bucket_size + image_num_rel_dis = (2 * config.image_bucket_size - 1) * ( + 2 * config.image_bucket_size - 1) + 3 + image_rp_bucket = make_image_bucket_position( + config.image_bucket_size, image_num_rel_dis) + image_position_idx = torch.arange(self.window_size).unsqueeze(0).expand(self.window_size, self.window_size) + \ + torch.arange(self.window_size).unsqueeze(1) * config.image_bucket_size + 1 # noqa + image_position_idx = torch.cat( + [torch.tensor([0]), + image_position_idx.view(-1)]) + image_position_idx = torch.cat( + [image_position_idx, + torch.tensor([1024] * 768)]) + self.register_buffer('image_position_idx', image_position_idx) + + self.image_rel_pos_table_list = nn.ModuleList([ + Embedding( + image_num_rel_dis, + self.num_attention_heads, + zero_init=True) for _ in range(num_rel_pos_tables) + ]) + self.register_buffer('image_rp_bucket', image_rp_bucket) self.register_buffer('token_rp_bucket', token_rp_bucket) - self.register_buffer('image_rp_bucket', image_rp_bucket) - self.register_buffer('image_position_idx', image_position_idx) self.entangle_position_embedding = config.entangle_position_embedding self.gradient_checkpointing = False @@ -1556,26 +1596,46 @@ class OFADecoder(OFAPreTrainedModel): batch_size = tgt_pos_embed.size(0) tgt_len = tgt_pos_embed.size(1) - 
tgt_pos_embed = self.image_pos_ln( - tgt_pos_embed) if use_image else self.pos_ln(tgt_pos_embed) + if not self.use_ofasys: + tgt_pos_embed = self.image_pos_ln( + tgt_pos_embed) if use_image else self.pos_ln(tgt_pos_embed) if src_pos_embed is not None: src_len = src_pos_embed.size(1) - pos_q = self.cross_pos_q_linear(tgt_pos_embed).view( - batch_size, tgt_len, self.num_attention_heads, -1).transpose( - 1, 2) * self.pos_scaling - pos_k = self.cross_pos_k_linear(src_pos_embed).view( - batch_size, src_len, self.num_attention_heads, - -1).transpose(1, 2) + if not (self.entangle_position_embedding and self.use_ofasys): + pos_q = self.cross_pos_q_linear(tgt_pos_embed).view( + batch_size, tgt_len, self.num_attention_heads, + -1).transpose(1, 2) * self.pos_scaling + pos_k = self.cross_pos_k_linear(src_pos_embed).view( + batch_size, src_len, self.num_attention_heads, + -1).transpose(1, 2) + abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + else: + abs_pos_bias = torch.zeros( + batch_size, + self.num_attention_heads, + tgt_len, + src_len, + dtype=tgt_pos_embed.dtype, + device=tgt_pos_embed.device) else: - src_len = tgt_pos_embed.size(1) - pos_q = self.self_pos_q_linear(tgt_pos_embed).view( - batch_size, tgt_len, self.num_attention_heads, -1).transpose( - 1, 2) * self.pos_scaling - pos_k = self.self_pos_k_linear(tgt_pos_embed).view( - batch_size, src_len, self.num_attention_heads, - -1).transpose(1, 2) - abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + # batch_size, seq_length = tgt_pos_embed.size(0), tgt_pos_embed.size(1) + if not (self.entangle_position_embedding and self.use_ofasys): + pos_q = self.self_pos_q_linear(tgt_pos_embed).view( + batch_size, tgt_len, self.num_attention_heads, + -1).transpose(1, 2) * self.pos_scaling + pos_k = self.self_pos_k_linear(tgt_pos_embed).view( + batch_size, tgt_len, self.num_attention_heads, + -1).transpose(1, 2) + abs_pos_bias = torch.matmul(pos_q, pos_k.transpose(2, 3)) + else: + abs_pos_bias = torch.zeros( + batch_size, + self.num_attention_heads, + tgt_len, + tgt_len, + dtype=tgt_pos_embed.dtype, + device=tgt_pos_embed.device) return abs_pos_bias @@ -1809,17 +1869,18 @@ class OFADecoder(OFAPreTrainedModel): past_key_values) > 0 else None self_attn_bias = self_abs_pos_bias.clone() + real_idx = 0 if self.share_attn_bias else idx if code_masks is None or not code_masks.any(): self_attn_bias += self.get_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) elif code_masks is not None and code_masks.all(): self_attn_bias += self.get_image_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) else: self_attn_bias[~code_masks] += self.get_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) self_attn_bias[code_masks] += self.get_image_rel_pos_bias( - all_prev_output_tokens, idx).unsqueeze(0) + all_prev_output_tokens, real_idx).unsqueeze(0) self_attn_bias = self_attn_bias.reshape( -1, *self_attn_bias.size()[-2:]) @@ -1892,6 +1953,7 @@ class OFAModel(OFAPreTrainedModel): self.encoder = OFAEncoder(config, shared) self.decoder = OFADecoder(config, shared) + self.use_ofasys = config.use_ofasys # Initialize weights and apply final processing self.post_init() diff --git a/modelscope/models/multi_modal/ofa/utils/utils.py b/modelscope/models/multi_modal/ofa/utils/utils.py index 6d8943a1..c5aa8483 100644 --- a/modelscope/models/multi_modal/ofa/utils/utils.py +++ 
b/modelscope/models/multi_modal/ofa/utils/utils.py @@ -2,6 +2,7 @@ from typing import Optional import torch +import torch.nn as nn def expand_mask(mask: torch.Tensor, @@ -17,3 +18,42 @@ def expand_mask(mask: torch.Tensor, src_len).to(dtype) return expanded_mask.masked_fill(expanded_mask.bool(), torch.finfo(dtype).min) + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + r""" + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Args: + x (`nn.Modules`): input nn layers. + drop_prob (`float`): drop path ratio. + training (`bool`): whether is training or inference. + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (1, x.shape[1], 1) + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + r""" + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Args: + drop_prob: drop path ratio. + """ + + def __init__(self, drop_prob=None): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) diff --git a/modelscope/models/multi_modal/ofa/vit.py b/modelscope/models/multi_modal/ofa/vit.py new file mode 100644 index 00000000..b6bba7ee --- /dev/null +++ b/modelscope/models/multi_modal/ofa/vit.py @@ -0,0 +1,155 @@ +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from fairseq.modules import LayerNorm +from torch import nn + +from .utils.utils import DropPath + +__all__ = [ + 'vit_base', + 'vit_large', + 'vit_large_336', + 'vit_huge', +] + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None, + drop_path_rate=0.0): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([ + ('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model)), + ])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + self.drop_path = DropPath(drop_path_rate) + + def attention(self, x: torch.Tensor): + self.attn_mask = ( + self.attn_mask.to(dtype=x.dtype, device=x.device) + if self.attn_mask is not None else None) + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.drop_path(self.attention(self.ln_1(x))) + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Transformer(nn.Module): + + def __init__( + self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask, drop_path_rate) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__( + self, + input_resolution: int, + patch_size: int, + width: int, + layers: int, + heads: int, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.input_resolution = 
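# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: the DropPath module relocated to
# utils/utils.py implements stochastic depth -- an identity at inference time,
# and randomly zeroed (then rescaled) residual branches during training.
# Assumes the import path created by this patch.
import torch
from modelscope.models.multi_modal.ofa.utils.utils import DropPath

dp = DropPath(drop_prob=0.2)
x = torch.ones(4, 10, 8)      # [seq_len, batch, dim], the layout used in the ViT blocks

dp.eval()
assert torch.equal(dp(x), x)  # no-op at inference time

dp.train()
y = dp(x)                     # per-sample paths dropped; kept entries scaled by 1 / 0.8
# ---------------------------------------------------------------------------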
input_resolution + self.patch_size = patch_size + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False, + ) + + scale = width**-0.5 + self.width = width + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + self.transformer = Transformer( + width, layers, heads, drop_path_rate=drop_path_rate) + + def forward(self, x: torch.Tensor): + resolution = x.shape[-2] + height, width = x.shape[-2] // self.patch_size, x.shape[ + -1] // self.patch_size + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + if resolution != self.input_resolution: + old_pe = self.positional_embedding[1:] + patch_num = self.input_resolution // self.patch_size + old_pe = old_pe.reshape(1, patch_num, patch_num, + -1).permute(0, 3, 1, 2) + new_pe = F.interpolate( + old_pe, size=(height, width), mode='bilinear') + new_pe = new_pe.permute(0, 2, 3, 1).reshape(height * width, -1) + x = x + new_pe.to(x.dtype) + else: + x = x + self.positional_embedding[1:].to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + bz, seq, hidden = x.shape + x = x.transpose(1, 2).reshape(bz, hidden, height, width) + + return x + + +def vit_base(drop_path_rate: float = 0.0): + return VisionTransformer(224, 16, 768, 9, 12, drop_path_rate) + + +def vit_large(drop_path_rate: float = 0.0): + return VisionTransformer(224, 14, 1024, 18, 16, drop_path_rate) + + +def vit_large_336(drop_path_rate: float = 0.0): + return VisionTransformer(336, 14, 1024, 18, 16, drop_path_rate) + + +def vit_huge(drop_path_rate: float = 0.0): + return VisionTransformer(224, 14, 1280, 24, 16, drop_path_rate) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 56d19ad8..2c6034e8 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -53,8 +53,11 @@ class OfaForAllTasks(TorchModel): raise NotImplementedError # there is some diff between here and our ofa code, # there will be no need to use param: use_bpe - self.tokenizer.add_tokens([''.format(i) for i in range(8192)]) - self.tokenizer.add_tokens([''.format(i) for i in range(1000)]) + if not model.use_ofasys: + self.tokenizer.add_tokens( + [''.format(i) for i in range(8192)]) + self.tokenizer.add_tokens( + [''.format(i) for i in range(1000)]) self.cfg.update({'num_bins': 1000, 'num_codes': 8192}) self.batch_size = self.cfg.model.get('batch_size', 1) self.patch_image_size = self.cfg.model.get('patch_image_size', 480) From e72988c2bae19c9c7bc7ea08bc940515a766bac7 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Mon, 31 Oct 2022 20:46:49 +0800 Subject: [PATCH 14/46] add face detection to face_2d_keypoints_pipeline --- modelscope/outputs/outputs.py | 23 +- .../face_2d_keypoints_pipeline.py | 254 +++++++++++++++++- modelscope/utils/cv/image_utils.py | 65 +++++ tests/pipelines/test_face_2d_keypoints.py | 29 +- 4 files changed, 347 insertions(+), 24 deletions(-) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index b983125a..b7003809 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -69,11 +69,23 @@ TASK_OUTPUTS = { # face 2d keypoint result for single sample # 
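# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a minimal sketch of what the new
# ViT backbones in vit.py return -- a feature map of shape
# [batch, width, H // patch_size, W // patch_size], which OFAEncoder then maps
# to the embedding size via image_proj. Running it requires the fairseq
# dependency imported by vit.py.
import torch
from modelscope.models.multi_modal.ofa.vit import vit_base

backbone = vit_base(drop_path_rate=0.0)
images = torch.randn(1, 3, 224, 224)
features = backbone(images)
assert features.shape == (1, backbone.width, 14, 14)   # 224 / 16 = 14
# ---------------------------------------------------------------------------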
{ # "keypoints": [ - # [x1, y1]*106 + # [[x, y]*106], + # [[x, y]*106], + # [[x, y]*106], # ], - # "poses": [pitch, roll, yaw] + # "poses": [ + # [pitch, roll, yaw], + # [pitch, roll, yaw], + # [pitch, roll, yaw], + # ], + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] # } - Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES], + Tasks.face_2d_keypoints: + [OutputKeys.KEYPOINTS, OutputKeys.POSES, OutputKeys.BOXES], # face detection result for single sample # { @@ -699,8 +711,9 @@ TASK_OUTPUTS = { # "text_embedding": np.array with shape [1, D], # "caption": "this is an image caption text." # } - Tasks.generative_multi_modal_embedding: - [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION], + Tasks.generative_multi_modal_embedding: [ + OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION + ], # multi-modal similarity result for single sample # { diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index b48d013e..4de5a4f2 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -1,9 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import math from typing import Any +import cv2 +import numpy as np + from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from .base import EasyCVPipeline @@ -29,18 +36,251 @@ class Face2DKeypointsPipeline(EasyCVPipeline): *args, **kwargs) + # face detect pipeline + det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.face_detection = pipeline( + Tasks.face_detection, model=det_model_id) + def show_result(self, img, points, scale=2, save_path=None): return self.predict_op.show_result(img, points, scale, save_path) + def _choose_face(self, det_result, min_face=10): + """ + choose face with maximum area + Args: + det_result: output of face detection pipeline + min_face: minimum size of valid face w/h + """ + bboxes = np.array(det_result[OutputKeys.BOXES]) + landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) + if bboxes.shape[0] == 0: + logger.warn('No face detected!') + return None + # face idx with enough size + face_idx = [] + for i in range(bboxes.shape[0]): + box = bboxes[i] + if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: + face_idx += [i] + if len(face_idx) == 0: + logger.warn( + f'Face size not enough, less than {min_face}x{min_face}!') + return None + bboxes = bboxes[face_idx] + landmarks = landmarks[face_idx] + + return bboxes, landmarks + + def expend_box(self, box, w, h, scalex=0.3, scaley=0.5): + x1 = box[0] + y1 = box[1] + wb = box[2] - x1 + hb = box[3] - y1 + deltax = int(wb * scalex) + deltay1 = int(hb * scaley) + deltay2 = int(hb * scalex) + x1 = x1 - deltax + y1 = y1 - deltay1 + if x1 < 0: + deltax = deltax + x1 + x1 = 0 + if y1 < 0: + deltay1 = deltay1 + y1 + y1 = 0 + x2 = x1 + wb + 2 * deltax + y2 = y1 + hb + deltay1 + deltay2 + x2 = np.clip(x2, 0, w - 1) + y2 = np.clip(y2, 0, h - 1) + return [x1, y1, x2, y2] + + def rotate_point(self, angle, center, landmark): + rad = angle * np.pi / 180.0 + alpha = np.cos(rad) + beta = np.sin(rad) + M = np.zeros((2, 3), 
dtype=np.float32) + M[0, 0] = alpha + M[0, 1] = beta + M[0, 2] = (1 - alpha) * center[0] - beta * center[1] + M[1, 0] = -beta + M[1, 1] = alpha + M[1, 2] = beta * center[0] + (1 - alpha) * center[1] + + landmark_ = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2], + M[1, 0] * x + M[1, 1] * y + M[1, 2]) + for (x, y) in landmark]) + return M, landmark_ + + def random_normal(self): + """ + 3-sigma rule + return: (-1, +1) + """ + mu, sigma = 0, 1 + while True: + s = np.random.normal(mu, sigma) + if s < mu - 3 * sigma or s > mu + 3 * sigma: + continue + return s / 3 * sigma + + def rotate_crop_img(self, img, pts, M): + image_size = 256 + enlarge_ratio = 1.1 + + imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0]))) + + x1 = pts[5][0] + y1 = pts[5][1] + x2 = pts[6][0] + y2 = pts[6][1] + w = x2 - x1 + 1 + h = y2 - y1 + 1 + x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) + y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) + + new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) + new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) + new_x1 = x1 + int(self.random_normal() * image_size * 0.05) + new_y1 = y1 + int(self.random_normal() * image_size * 0.05) + new_x2 = new_x1 + new_w + new_y2 = new_y1 + new_h + + height, width, _ = imgT.shape + dx = max(0, -new_x1) + dy = max(0, -new_y1) + new_x1 = max(0, new_x1) + new_y1 = max(0, new_y1) + + edx = max(0, new_x2 - width) + edy = max(0, new_y2 - height) + new_x2 = min(width, new_x2) + new_y2 = min(height, new_y2) + + sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] + if dx > 0 or dy > 0 or edx > 0 or edy > 0: + sub_imgT = cv2.copyMakeBorder( + sub_imgT, + dy, + edy, + dx, + edx, + cv2.BORDER_CONSTANT, + value=(103.94, 116.78, 123.68)) + + return sub_imgT, imgT, [new_x1, new_y1, new_x2, + new_y2], [dx, dy, edx, edy] + + def crop_img(self, imgT, pts, angle): + image_size = 256 + enlarge_ratio = 1.1 + + x1 = np.min(pts[:, 0]) + x2 = np.max(pts[:, 0]) + y1 = np.min(pts[:, 1]) + y2 = np.max(pts[:, 1]) + w = x2 - x1 + 1 + h = y2 - y1 + 1 + x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) + y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) + + new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) + new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) + new_x1 = x1 + int(self.random_normal() * image_size * 0.05) + new_y1 = y1 + int(self.random_normal() * image_size * 0.05) + new_x2 = new_x1 + new_w + new_y2 = new_y1 + new_h + + new_xy = new_x1, new_y1 + pts = pts - new_xy + + height, width, _ = imgT.shape + dx = max(0, -new_x1) + dy = max(0, -new_y1) + new_x1 = max(0, new_x1) + new_y1 = max(0, new_y1) + + edx = max(0, new_x2 - width) + edy = max(0, new_y2 - height) + new_x2 = min(width, new_x2) + new_y2 = min(height, new_y2) + + sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] + if dx > 0 or dy > 0 or edx > 0 or edy > 0: + sub_imgT = cv2.copyMakeBorder( + sub_imgT, + dy, + edy, + dx, + edx, + cv2.BORDER_CONSTANT, + value=(103.94, 116.78, 123.68)) + + return sub_imgT, [new_x1, new_y1, new_x2, new_y2], [dx, dy, edx, edy] + def __call__(self, inputs) -> Any: - outputs = self.predict_op(inputs) + image_size = 256 + + img = LoadImage.convert_to_ndarray(inputs) + h, w, c = img.shape + img_rgb = copy.deepcopy(img) + img_rgb = img_rgb[:, :, ::-1] + det_result = self.face_detection(img_rgb) + boxes, keypoints = self._choose_face(det_result) + + output_boxes = [] + output_keypoints = [] + output_poses = [] + for idx, box_ori in enumerate(boxes): + box = self.expend_box(box_ori, w, h, scalex=0.15, scaley=0.15) + y0 = 
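# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a pure-numpy sketch of what
# rotate_point computes -- the 2x3 affine matrix for a rotation of `angle`
# degrees about `center` (the cv2.getRotationMatrix2D convention) and the
# landmarks mapped through it. The sample point is arbitrary.
import numpy as np

def rotate_point(angle, center, landmark):
    rad = angle * np.pi / 180.0
    alpha, beta = np.cos(rad), np.sin(rad)
    M = np.array([[alpha, beta, (1 - alpha) * center[0] - beta * center[1]],
                  [-beta, alpha, beta * center[0] + (1 - alpha) * center[1]]],
                 dtype=np.float32)
    pts = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2],
                       M[1, 0] * x + M[1, 1] * y + M[1, 2]) for x, y in landmark])
    return M, pts

_, rotated = rotate_point(90, (0, 0), [(1.0, 0.0)])
print(np.round(rotated, 3))   # a 90-degree rotation about the origin maps (1, 0) to (0, -1)
# ---------------------------------------------------------------------------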
int(box[1]) + y1 = int(box[3]) + x0 = int(box[0]) + x1 = int(box[2]) + sub_img = img[y0:y1, x0:x1] + + keypoint = keypoints[idx] + pts = [[keypoint[0], keypoint[1]], [keypoint[2], keypoint[3]], + [keypoint[4], keypoint[5]], [keypoint[6], keypoint[7]], + [keypoint[8], keypoint[9]], [box[0], box[1]], + [box[2], box[3]]] + # radian + angle = math.atan2((pts[1][1] - pts[0][1]), + (pts[1][0] - pts[0][0])) + # angle + theta = angle * (180 / np.pi) + + center = [image_size // 2, image_size // 2] + cx, cy = center + M, landmark_ = self.rotate_point(theta, (cx, cy), pts) + sub_img, imgT, bbox, delta_border = self.rotate_crop_img( + img, pts, M) + + outputs = self.predict_op([sub_img])[0] + tmp_keypoints = outputs['point'] + + for idx in range(0, len(tmp_keypoints)): + tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) + tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + + for idx in range(0, 3): + sub_img, bbox, delta_border = self.crop_img( + imgT, tmp_keypoints, 0) + outputs = self.predict_op([sub_img])[0] + tmp_keypoints = outputs['point'] + for idx in range(0, len(tmp_keypoints)): + tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) + tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + + M2, tmp_keypoints = self.rotate_point(-theta, (cx, cy), + tmp_keypoints) - results = [{ - OutputKeys.KEYPOINTS: output['point'], - OutputKeys.POSES: output['pose'] - } for output in outputs] + output_keypoints.append(np.array(tmp_keypoints)) + output_poses.append(np.array(outputs['pose'])) + output_boxes.append(np.array(box_ori)) - if self._is_single_inputs(inputs): - results = results[0] + results = { + OutputKeys.KEYPOINTS: output_keypoints, + OutputKeys.POSES: output_poses, + OutputKeys.BOXES: output_boxes + } return results diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 34dc2348..095c36ec 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -91,6 +91,71 @@ def draw_keypoints(output, original_image): return image +def draw_106face_keypoints(in_path, + keypoints, + boxes, + scale=4.0, + save_path=None): + face_contour_point_index = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 + ] + left_eye_brow_point_index = [33, 34, 35, 36, 37, 38, 39, 40, 41, 33] + right_eye_brow_point_index = [42, 43, 44, 45, 46, 47, 48, 49, 50, 42] + left_eye_point_index = [66, 67, 68, 69, 70, 71, 72, 73, 66] + right_eye_point_index = [75, 76, 77, 78, 79, 80, 81, 82, 75] + nose_bridge_point_index = [51, 52, 53, 54] + nose_contour_point_index = [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65] + mouth_outer_point_index = [ + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 84 + ] + mouth_inter_point_index = [96, 97, 98, 99, 100, 101, 102, 103, 96] + + img = cv2.imread(in_path) + + for i in range(len(boxes)): + draw_box(img, np.array(boxes[i])) + + image = cv2.resize(img, dsize=None, fx=scale, fy=scale) + + def draw_line(point_index, image, point): + for i in range(len(point_index) - 1): + cur_index = point_index[i] + next_index = point_index[i + 1] + cur_pt = (int(point[cur_index][0] * scale), + int(point[cur_index][1] * scale)) + next_pt = (int(point[next_index][0] * scale), + int(point[next_index][1] * scale)) + cv2.line(image, cur_pt, next_pt, (0, 0, 255), thickness=2) + + for i in range(len(keypoints)): + points = keypoints[i] + + draw_line(face_contour_point_index, image, points) + draw_line(left_eye_brow_point_index, image, points) + 
draw_line(right_eye_brow_point_index, image, points) + draw_line(left_eye_point_index, image, points) + draw_line(right_eye_point_index, image, points) + draw_line(nose_bridge_point_index, image, points) + draw_line(nose_contour_point_index, image, points) + draw_line(mouth_outer_point_index, image, points) + draw_line(mouth_inter_point_index, image, points) + + size = len(points) + for i in range(size): + x = int(points[i][0]) + y = int(points[i][1]) + cv2.putText(image, str(i), (int(x * scale), int(y * scale)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) + cv2.circle(image, (int(x * scale), int(y * scale)), 2, (0, 255, 0), + cv2.FILLED) + + if save_path is not None: + cv2.imwrite(save_path, image) + + return image + + def draw_face_detection_no_lm_result(img_path, detection_result): bboxes = np.array(detection_result[OutputKeys.BOXES]) scores = np.array(detection_result[OutputKeys.SCORES]) diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py index a5e347e8..7ccc8a59 100644 --- a/tests/pipelines/test_face_2d_keypoints.py +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -1,11 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest -import cv2 - from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_106face_keypoints from modelscope.utils.test_utils import test_level @@ -13,7 +12,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_face_2d_keypoints(self): - img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png' + img_path = 'data/test/images/face_detection.png' model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment' face_2d_keypoints_align = pipeline( @@ -21,15 +20,21 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): output = face_2d_keypoints_align(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] - output_pose = output[OutputKeys.POSES] - - img = cv2.imread(img_path) - img = face_2d_keypoints_align.show_result( - img, output_keypoints, scale=2, save_path='face_keypoints.jpg') - - self.assertEqual(output_keypoints.shape[0], 106) - self.assertEqual(output_keypoints.shape[1], 2) - self.assertEqual(output_pose.shape[0], 3) + output_poses = output[OutputKeys.POSES] + output_boxes = output[OutputKeys.BOXES] + + draw_106face_keypoints( + img_path, + output_keypoints, + output_boxes, + scale=2, + save_path='face_keypoints.jpg') + + for idx in range(len(output_keypoints)): + self.assertEqual(output_keypoints[idx].shape[0], 106) + self.assertEqual(output_keypoints[idx].shape[1], 2) + self.assertEqual(output_poses[idx].shape[0], 3) + self.assertEqual(output_boxes[idx].shape[0], 4) if __name__ == '__main__': From 0d3b7b0df210418326295c4cbe1c07152e540af0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 31 Oct 2022 20:52:27 +0800 Subject: [PATCH 15/46] [to #42322933]fix bugs relate to token cls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.修复token classification preprocessor finetune结果错误问题 2.修复word segmentation output 无用属性 3. 修复nlp preprocessor传use_fast错误 4. 修复torch model exporter bug 5. 
修复文档撰写过程中发现trainer相关bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10573269 --- modelscope/exporters/torch_model_exporter.py | 5 +- modelscope/outputs/outputs.py | 11 +- .../nlp/token_classification_pipeline.py | 4 +- .../nlp/word_segmentation_pipeline.py | 6 +- modelscope/preprocessors/nlp/nlp_base.py | 17 +- .../nlp/token_classification_preprocessor.py | 148 ++++++++++-------- .../trainers/nlp/text_generation_trainer.py | 2 +- modelscope/trainers/nlp_trainer.py | 6 +- modelscope/trainers/trainer.py | 2 +- tests/outputs/test_model_outputs.py | 3 +- .../test_finetune_token_classificatin.py | 2 +- 11 files changed, 110 insertions(+), 96 deletions(-) diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 7bf6c0c0..1d332591 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -128,7 +128,7 @@ class TorchModelExporter(Exporter): args_list = list(args) else: args_list = [args] - if isinstance(args_list[-1], dict): + if isinstance(args_list[-1], Mapping): args_dict = args_list[-1] args_list = args_list[:-1] n_nonkeyword = len(args_list) @@ -284,9 +284,8 @@ class TorchModelExporter(Exporter): 'Model property dummy_inputs must be set.') dummy_inputs = collate_fn(dummy_inputs, device) if isinstance(dummy_inputs, Mapping): - dummy_inputs = self._decide_input_format(model, dummy_inputs) dummy_inputs_filter = [] - for _input in dummy_inputs: + for _input in self._decide_input_format(model, dummy_inputs): if _input is not None: dummy_inputs_filter.append(_input) else: diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index b7003809..2c6dd85a 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -491,17 +491,8 @@ TASK_OUTPUTS = { # word segmentation result for single sample # { # "output": "今天 天气 不错 , 适合 出去 游玩" - # "labels": [ - # {'word': '今天', 'label': 'PROPN'}, - # {'word': '天气', 'label': 'PROPN'}, - # {'word': '不错', 'label': 'VERB'}, - # {'word': ',', 'label': 'NUM'}, - # {'word': '适合', 'label': 'NOUN'}, - # {'word': '出去', 'label': 'PART'}, - # {'word': '游玩', 'label': 'ADV'}, - # ] # } - Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], + Tasks.word_segmentation: [OutputKeys.OUTPUT], # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 75bc538d..4af187ee 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -109,13 +109,13 @@ class TokenClassificationPipeline(Pipeline): chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - # for cws output + # for cws outputs if len(chunks) > 0 and chunks[0]['type'] == 'cws': spans = [ chunk['span'] for chunk in chunks if chunk['span'].strip() ] seg_result = ' '.join(spans) - outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + outputs = {OutputKeys.OUTPUT: seg_result} # for ner outputs else: diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 0df8f1ad..c57f6b93 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -115,15 +115,15 @@ class WordSegmentationPipeline(Pipeline): chunk['span'] = 
text[chunk['start']:chunk['end']] chunks.append(chunk) - # for cws output + # for cws outputs if len(chunks) > 0 and chunks[0]['type'] == 'cws': spans = [ chunk['span'] for chunk in chunks if chunk['span'].strip() ] seg_result = ' '.join(spans) - outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + outputs = {OutputKeys.OUTPUT: seg_result} - # for ner outpus + # for ner output else: outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 48a04d7a..45efc6e7 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -34,6 +34,7 @@ class NLPBasePreprocessor(Preprocessor, ABC): label=None, label2id=None, mode=ModeKeys.INFERENCE, + use_fast=None, **kwargs): """The NLP preprocessor base class. @@ -45,14 +46,18 @@ class NLPBasePreprocessor(Preprocessor, ABC): label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode + use_fast: use the fast version of tokenizer + """ self.model_dir = model_dir self.first_sequence = first_sequence self.second_sequence = second_sequence self.label = label - self.use_fast = kwargs.pop('use_fast', None) - if self.use_fast is None and os.path.isfile( + self.use_fast = use_fast + if self.use_fast is None and model_dir is None: + self.use_fast = False + elif self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): with open(os.path.join(model_dir, 'tokenizer_config.json'), 'r') as f: @@ -61,8 +66,8 @@ class NLPBasePreprocessor(Preprocessor, ABC): self.use_fast = False if self.use_fast is None else self.use_fast self.label2id = label2id - if self.label2id is None: - self.label2id = parse_label_mapping(self.model_dir) + if self.label2id is None and model_dir is not None: + self.label2id = parse_label_mapping(model_dir) super().__init__(mode, **kwargs) @property @@ -106,6 +111,7 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): label: str = 'label', label2id: dict = None, mode: str = ModeKeys.INFERENCE, + use_fast: bool = None, **kwargs): """The NLP tokenizer preprocessor base class. @@ -122,11 +128,12 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): - config.json label2id/id2label - label_mapping.json mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. + use_fast: use the fast version of tokenizer kwargs: These kwargs will be directly fed into the tokenizer. 
""" super().__init__(model_dir, first_sequence, second_sequence, label, - label2id, mode) + label2id, mode, use_fast, **kwargs) self.model_dir = model_dir self.tokenize_kwargs = kwargs self.tokenizer = self.build_tokenizer(model_dir) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 2de0c806..5069048b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Tuple, Union +import numpy as np import torch from modelscope.metainfo import Preprocessors @@ -20,9 +21,7 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): """ def __init__(self, **kwargs): - super().__init__(**kwargs) - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') + self.first_sequence: str = kwargs.pop('first_sequence', 'tokens') self.label = kwargs.pop('label', OutputKeys.LABELS) def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: @@ -80,10 +79,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): 'is_split_into_words', False) if 'label2id' in kwargs: kwargs.pop('label2id') - self.tokenize_kwargs = kwargs - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[dict, str]) -> Dict[str, Any]: """process the raw input data Args: @@ -99,18 +97,24 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): text = None labels_list = None if isinstance(data, str): + # for inference inputs without label text = data + self.tokenize_kwargs['add_special_tokens'] = False elif isinstance(data, dict): + # for finetune inputs with label text = data.get(self.first_sequence) labels_list = data.get(self.label) + if isinstance(text, list): + self.tokenize_kwargs['is_split_into_words'] = True input_ids = [] label_mask = [] offset_mapping = [] - if self.is_split_into_words: - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) + token_type_ids = [] + if self.is_split_into_words and self._mode == ModeKeys.INFERENCE: + for offset, token in enumerate(list(text)): + subtoken_ids = self.tokenizer.encode(token, + **self.tokenize_kwargs) if len(subtoken_ids) == 0: subtoken_ids = [self.tokenizer.unk_token_id] input_ids.extend(subtoken_ids) @@ -119,10 +123,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): else: if self.tokenizer.is_fast: encodings = self.tokenizer( - text, - add_special_tokens=False, - return_offsets_mapping=True, - **self.tokenize_kwargs) + text, return_offsets_mapping=True, **self.tokenize_kwargs) + attention_mask = encodings['attention_mask'] + token_type_ids = encodings['token_type_ids'] input_ids = encodings['input_ids'] word_ids = encodings.word_ids() for i in range(len(word_ids)): @@ -143,69 +146,80 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( text) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] + if 
self._mode == ModeKeys.INFERENCE: + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] - if self._mode == ModeKeys.INFERENCE: input_ids = torch.tensor(input_ids).unsqueeze(0) attention_mask = torch.tensor(attention_mask).unsqueeze(0) label_mask = torch.tensor( label_mask, dtype=torch.bool).unsqueeze(0) - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + else: + output = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + } - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) + # align the labels with tokenized text + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + output['labels'] = labels + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } return output def 
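# ---------------------------------------------------------------------------
# Illustration, not part of the patch above: a standalone sketch of the label
# alignment performed for finetune inputs -- labels come per word, the fast
# tokenizer may split a word into several sub-tokens, and with
# label_all_tokens=False only the first sub-token keeps the word label while
# the rest (and special tokens) receive -100. All inputs below are made up.
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
labels_list = ['B-LOC', 'O']              # one label per word
word_ids = [None, 0, 0, 1, None]          # [CLS], word0 split in two, word1, [SEP]

label_row = [label2id[lb] for lb in labels_list]
label_ids, previous_word_idx = [], None
for word_idx in word_ids:
    if word_idx is None:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(label_row[word_idx])
    else:
        label_ids.append(-100)            # label_all_tokens=False branch
    previous_word_idx = word_idx

print(label_ids)                          # [-100, 1, -100, 0, -100]
# ---------------------------------------------------------------------------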
get_tokenizer_class(self): diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py index 0e26f153..f02faf71 100644 --- a/modelscope/trainers/nlp/text_generation_trainer.py +++ b/modelscope/trainers/nlp/text_generation_trainer.py @@ -18,7 +18,7 @@ class TextGenerationTrainer(NlpEpochBasedTrainer): return tokenizer.decode(tokens.tolist(), skip_special_tokens=True) def evaluation_step(self, data): - model = self.model + model = self.model.module if self._dist else self.model model.eval() with torch.no_grad(): diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index a92a3706..5ff6f62f 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -586,14 +586,16 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): preprocessor_mode=ModeKeys.TRAIN, **model_args, **self.train_keys, - mode=ModeKeys.TRAIN) + mode=ModeKeys.TRAIN, + use_fast=True) eval_preprocessor = Preprocessor.from_pretrained( self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL, **model_args, **self.eval_keys, - mode=ModeKeys.EVAL) + mode=ModeKeys.EVAL, + use_fast=True) return train_preprocessor, eval_preprocessor diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 7478d8e4..3556badf 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -876,7 +876,7 @@ class EpochBasedTrainer(BaseTrainer): Subclass and override to inject custom behavior. """ - model = self.model + model = self.model.module if self._dist else self.model model.eval() if is_parallel(model): diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py index 31271869..311ce201 100644 --- a/tests/outputs/test_model_outputs.py +++ b/tests/outputs/test_model_outputs.py @@ -21,9 +21,10 @@ class TestModelOutput(unittest.TestCase): self.assertEqual(outputs['logits'], torch.Tensor([1])) self.assertEqual(outputs[0], torch.Tensor([1])) self.assertEqual(outputs.logits, torch.Tensor([1])) + outputs.loss = torch.Tensor([2]) logits, loss = outputs self.assertEqual(logits, torch.Tensor([1])) - self.assertTrue(loss is None) + self.assertTrue(loss is not None) if __name__ == '__main__': diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py index 9bdab9b7..a92cee7b 100644 --- a/tests/trainers/test_finetune_token_classificatin.py +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -87,7 +87,7 @@ class TestFinetuneTokenClassification(unittest.TestCase): cfg['dataset'] = { 'train': { 'labels': label_enumerate_values, - 'first_sequence': 'first_sequence', + 'first_sequence': 'tokens', 'label': 'labels', } } From 3464324f6b5d9d0ef975cd0b0e76870e95b5fa22 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Mon, 31 Oct 2022 22:15:25 +0800 Subject: [PATCH 16/46] [to #42322933] limit datasets version for now --- requirements/framework.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index 2408cda6..17fbd8a3 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,7 @@ addict attrs -datasets +# version beyond 2.6.0 introduces compatbility issue and is being resolved +datasets<=2.6.0 easydict einops filelock>=3.3.0 From 5302259a0a3fb7cafdce473aa78990e7dc84e676 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 31 Oct 2022 22:46:17 +0800 Subject: [PATCH 17/46] [to #45854437]fix: add user name to 
user-agent Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10584797 --- modelscope/hub/api.py | 9 +++++++-- modelscope/hub/constants.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index dca6d099..7468e5e3 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -23,7 +23,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_MESSAGE, API_RESPONSE_FIELD_USERNAME, DEFAULT_CREDENTIALS_PATH, - MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS, + MODELSCOPE_ENVIRONMENT, + MODELSCOPE_USERNAME, ONE_YEAR_SECONDS, Licenses, ModelVisibility) from modelscope.hub.errors import (InvalidParameter, NotExistError, NotLoginException, NoValidRevisionError, @@ -760,14 +761,18 @@ class ModelScopeConfig: env = 'custom' if MODELSCOPE_ENVIRONMENT in os.environ: env = os.environ[MODELSCOPE_ENVIRONMENT] + user_name = 'unknown' + if MODELSCOPE_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_USERNAME] - ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % ( + ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % ( __version__, platform.python_version(), ModelScopeConfig.get_user_session_id(), platform.platform(), platform.processor(), env, + user_name, ) if isinstance(user_agent, dict): ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 730702c1..373a0cf4 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -18,6 +18,7 @@ API_RESPONSE_FIELD_EMAIL = 'Email' API_RESPONSE_FIELD_MESSAGE = 'Message' MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' +MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 From 06abae4dc6d68e99cba56608c857de5cdabd16b0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 1 Nov 2022 09:56:15 +0800 Subject: [PATCH 18/46] [to #42322933]add token-cls test cases and bug fix Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585502 --- .../nlp/token_classification_preprocessor.py | 3 +-- tests/pipelines/test_named_entity_recognition.py | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 5069048b..92b7c46b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -140,8 +140,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_mask.append(1) offset_mapping.append(encodings['offset_mapping'][i]) else: - encodings = self.tokenizer( - text, add_special_tokens=False, **self.tokenize_kwargs) + encodings = self.tokenizer(text, **self.tokenize_kwargs) input_ids = encodings['input_ids'] label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( text) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 3658cf3f..aef4aaed 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -19,9 +19,11 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.named_entity_recognition self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' + 
english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' + sentence_en = 'pizza shovel' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): @@ -89,6 +91,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_english_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.english_model_id) + print(pipeline_ins(input='pizza shovel')) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) From 9187103e3a32d4048e79e57d23fa596b2d1bffd5 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Tue, 1 Nov 2022 09:57:31 +0800 Subject: [PATCH 19/46] =?UTF-8?q?[to=20#42322933]=E5=85=BC=E5=AE=B9?= =?UTF-8?q?=E6=96=B0=E5=A2=9Eclip=20huge=E6=A8=A1=E5=9E=8B=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-?= =?UTF-8?q?lib/codereview/10585552?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * compatiable with vit huge, and set clip base default mm-ebed pipeline --- modelscope/models/multi_modal/clip/model.py | 6 ++++-- modelscope/pipelines/builder.py | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index b1c84292..9b82e4a1 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -349,11 +349,13 @@ class CLIP(nn.Module): text_num_hidden_layers: int, text_type_vocab_size: int, tokenizer: FullTokenizer, + # vision_head_width, added this param for ViT-H + vision_head_width: int = 64, ): super().__init__() if isinstance(vision_layers, (tuple, list)): - vision_heads = vision_width * 32 // 64 + vision_heads = vision_width * 32 // vision_head_width self.visual = ModifiedResNet( layers=vision_layers, output_dim=embed_dim, @@ -361,7 +363,7 @@ class CLIP(nn.Module): input_resolution=image_resolution, width=vision_width) else: - vision_heads = vision_width // 64 + vision_heads = vision_width // vision_head_width self.visual = VisualTransformer( input_resolution=image_resolution, patch_size=vision_patch_size, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 498c9ed8..70f8f11c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50_live-category'), Tasks.video_category: (Pipelines.video_category, 'damo/cv_resnet50_video-category'), - Tasks.multi_modal_embedding: - (Pipelines.multi_modal_embedding, - 'damo/multi-modal_clip-vit-large-patch14_zh'), + Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding, + 'damo/multi-modal_clip-vit-base-patch16_zh'), Tasks.generative_multi_modal_embedding: (Pipelines.generative_multi_modal_embedding, 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' From 40b677095605594d426b9c731687fb834d04b4fc Mon Sep 17 00:00:00 2001 From: "liugao.lg" 
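The vision_head_width argument added to CLIP above exists because ViT-H checkpoints use 80-dimensional attention heads rather than the 64 the old code hard-coded; dividing a 1280-wide ViT-H transformer by 64 would give 20 heads instead of the 16 that ViT-H/14 is usually configured with. A one-line sketch with commonly used configurations (the concrete numbers are illustrative, not read from this repo's model configs):

def vision_heads(vision_width: int, vision_head_width: int = 64) -> int:
    # attention head count = transformer width / per-head dimension
    return vision_width // vision_head_width

assert vision_heads(1024) == 16       # ViT-L/14: width 1024, 64-dim heads
assert vision_heads(1280, 80) == 16   # ViT-H/14: width 1280, 80-dim heads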
Date: Tue, 1 Nov 2022 10:22:11 +0800 Subject: [PATCH 20/46] [to #42322933]fix ocr prepreocess & conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复ocr预处理逻辑不一致问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10581697 --- modelscope/preprocessors/multi_modal.py | 1 - modelscope/preprocessors/ofa/ocr_recognition.py | 11 ++++++----- requirements/multi-modal.txt | 2 ++ tests/trainers/test_ofa_trainer.py | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 17dffb48..13876058 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -96,7 +96,6 @@ class OfaPreprocessor(Preprocessor): data = input else: data = self._build_dict(input) - data = self._ofa_input_compatibility_conversion(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 26fff9d2..a0342c14 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -2,12 +2,12 @@ from typing import Any, Dict import torch -from PIL import Image +import unicodedata2 from torchvision import transforms from torchvision.transforms import InterpolationMode from torchvision.transforms import functional as F +from zhconv import convert -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data[self.column_map['text']] - target = target.translate(self.transtab).strip() + target = sample['label'] target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) sample['target'] = self.tokenize_text(target, add_bos=False) @@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]) } if 'text' in self.column_map and self.column_map['text'] in data: - sample['label'] = data[self.column_map['text']] + target = data[self.column_map['text']] + target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) + sample['label'] = target return sample diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 255f6155..578f0b54 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -11,3 +11,5 @@ timm tokenizers torchvision transformers>=4.12.0 +unicodedata2 +zhconv diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 3f68a9fb..85c21881 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase): 'ocr_fudanvi_zh', subset_name='scene', namespace='modelscope', - split='train[:200]', + split='train[800:900]', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), eval_dataset=MsDataset.load( 'ocr_fudanvi_zh', From f451ff8905e1615ec3adb3110fac89d8fe9bb492 Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 11:22:46 +0800 Subject: [PATCH 21/46] api tagging for pipeline/train/evaluate --- modelscope/hub/api.py | 24 ++++++++++++++++++++++++ modelscope/pipelines/base.py | 5 ++++- modelscope/trainers/trainer.py | 7 
+++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 7468e5e3..36c246f1 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,6 +646,30 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) + def create_library_statistics(self, + method: str, + name: str, + cn_name: Optional[str]): + """ + create library statistics. called by train()/evaluate()/pipeline() + + Args: + method (str): called methed name,i.e train/evaluate/pipeline + name (str): model name, for example: damo/cv_unet_person-image-cartoon_compound-models + cn_name (str): model name in chinese, for example: 达摩卡通化模型 + Raises: + ValueError: If user_cookies is True, but no local cookie. + + Returns: + None + """ + path = f'{self.endpoint}/api/v1/statistics/library' + headers = {'user-agent': ModelScopeConfig.get_user_agent()} + params = {"Method": method, "Name": name, "CnName": cn_name} + r = requests.post(path, params=params, headers=headers) + r.raise_for_status() + return + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index bca80502..b8856dea 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -23,6 +23,7 @@ from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import _find_free_port, _is_free_port +from modelscope.hub.api import HubApi from .util import is_model, is_official_hub_path if is_torch_available(): @@ -151,7 +152,9 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - + _api = HubApi() + model_name = self.cfg.task + _api.create_library_statistics("pipeline", model_name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 3556badf..6e5f4180 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -39,6 +39,7 @@ from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) +from modelscope.hub.api import HubApi from .base import BaseTrainer from .builder import TRAINERS from .default_config import merge_cfg @@ -436,6 +437,9 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN + _api = HubApi() + model_name = self.cfg.task + _api.create_library_statistics("train", model_name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -456,6 +460,9 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): + _api = HubApi() + model_name = self.cfg.task + _api.create_library_statistics("evaluate", model_name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From a79a900e94d2bff8fd4e3d8843ff065f35ca6096 Mon Sep 17 00:00:00 2001 From: 
"jiangyu.xzy" Date: Tue, 1 Nov 2022 11:35:28 +0800 Subject: [PATCH 22/46] change api to utils --- modelscope/hub/api.py | 23 ----------------------- modelscope/hub/utils/utils.py | 13 +++++++++++++ modelscope/pipelines/base.py | 5 ++--- modelscope/trainers/trainer.py | 8 +++----- 4 files changed, 18 insertions(+), 31 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 36c246f1..224c55ff 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,29 +646,6 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) - def create_library_statistics(self, - method: str, - name: str, - cn_name: Optional[str]): - """ - create library statistics. called by train()/evaluate()/pipeline() - - Args: - method (str): called methed name,i.e train/evaluate/pipeline - name (str): model name, for example: damo/cv_unet_person-image-cartoon_compound-models - cn_name (str): model name in chinese, for example: 达摩卡通化模型 - Raises: - ValueError: If user_cookies is True, but no local cookie. - - Returns: - None - """ - path = f'{self.endpoint}/api/v1/statistics/library' - headers = {'user-agent': ModelScopeConfig.get_user_agent()} - params = {"Method": method, "Name": name, "CnName": cn_name} - r = requests.post(path, params=params, headers=headers) - r.raise_for_status() - return class ModelScopeConfig: diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index a54f3413..8d5db579 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,6 +4,7 @@ import hashlib import os from datetime import datetime from typing import Optional +import requests from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, @@ -12,6 +13,7 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, from modelscope.hub.errors import FileIntegrityError from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger +from modelscope.hub.api import ModelScopeConfig logger = get_logger() @@ -85,3 +87,14 @@ def file_integrity_validation(file_path, expected_sha256): msg = 'File %s integrity check failed, the download may be incomplete, please try again.' 
% file_path logger.error(msg) raise FileIntegrityError(msg) + + +def create_library_statistics(method: str, + name: str, + cn_name: Optional[str]): + path = f'{get_endpoint()}/api/v1/statistics/library' + headers = {'user-agent': ModelScopeConfig.get_user_agent()} + params = {"Method": method, "Name": name, "CnName": cn_name} + r = requests.post(path, params=params, headers=headers) + r.raise_for_status() + return diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b8856dea..a56ee934 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -23,7 +23,7 @@ from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import _find_free_port, _is_free_port -from modelscope.hub.api import HubApi +from modelscope.hub.utils.utils import create_library_statistics from .util import is_model, is_official_hub_path if is_torch_available(): @@ -152,9 +152,8 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - _api = HubApi() model_name = self.cfg.task - _api.create_library_statistics("pipeline", model_name, None) + create_library_statistics("pipeline", model_name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 6e5f4180..92541252 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -39,7 +39,7 @@ from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) -from modelscope.hub.api import HubApi +from modelscope.hub.utils.utils import create_library_statistics from .base import BaseTrainer from .builder import TRAINERS from .default_config import merge_cfg @@ -437,9 +437,8 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - _api = HubApi() model_name = self.cfg.task - _api.create_library_statistics("train", model_name, None) + create_library_statistics("train", model_name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -460,9 +459,8 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): - _api = HubApi() model_name = self.cfg.task - _api.create_library_statistics("evaluate", model_name, None) + create_library_statistics("evaluate", model_name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From 60af6b701b453fdb09cf1f326f8cfac35fcfa27f Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 11:59:59 +0800 Subject: [PATCH 23/46] fix task to model; handle exception --- modelscope/hub/utils/utils.py | 13 ++++++++----- modelscope/pipelines/base.py | 2 +- modelscope/trainers/trainer.py | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 8d5db579..5c915998 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -92,9 
+92,12 @@ def file_integrity_validation(file_path, expected_sha256): def create_library_statistics(method: str, name: str, cn_name: Optional[str]): - path = f'{get_endpoint()}/api/v1/statistics/library' - headers = {'user-agent': ModelScopeConfig.get_user_agent()} - params = {"Method": method, "Name": name, "CnName": cn_name} - r = requests.post(path, params=params, headers=headers) - r.raise_for_status() + try: + path = f'{get_endpoint()}/api/v1/statistics/library' + headers = {'user-agent': ModelScopeConfig.get_user_agent()} + params = {"Method": method, "Name": name, "CnName": cn_name} + r = requests.post(path, params=params, headers=headers) + r.raise_for_status() + except Exception: + pass return diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index a56ee934..9280cc09 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -152,7 +152,7 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - model_name = self.cfg.task + model_name = self.cfg.model.type create_library_statistics("pipeline", model_name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 92541252..522405ff 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -437,7 +437,7 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - model_name = self.cfg.task + model_name = self.cfg.model.type create_library_statistics("train", model_name, None) if self.train_dataset is None: @@ -459,7 +459,7 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): - model_name = self.cfg.task + model_name = self.cfg.model.type create_library_statistics("evaluate", model_name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook From 4080f8071e96d4dbcc5ae8af10b051e14fea30ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 12:57:04 +0800 Subject: [PATCH 24/46] temp --- modelscope/hub/api.py | 11 +++++++++++ modelscope/msdatasets/ms_dataset.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 7468e5e3..0262fc1d 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -646,6 +646,17 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) + def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str): + # todo: 1. check args 2. 
+ + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}' + cookies = ModelScopeConfig.get_cookies() + r = requests.post(url, cookies=cookies, headers=self.headers) + resp = r.json() + raise_on_error(resp) + print(resp) + return resp['Message'] + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 0c537df7..a7d29990 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -727,3 +727,17 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg + + +if __name__ == '__main__': + from modelscope.hub.api import HubApi + api = HubApi() + # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da') # online + # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6') # test + + channel = 'local' # dsw + dataset_name = 'small_coco_for_test' + namespace = 'wangxingjun778test' + resp = api.count_uv_by_channel( + dataset_name=dataset_name, namespace=namespace, channel=channel) + print(resp) From f5c31b33198288405f209773cd41a5efa1991e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 13:31:25 +0800 Subject: [PATCH 25/46] Add miss init --- .../models/science/unifold/modules/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 modelscope/models/science/unifold/modules/__init__.py diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py new file mode 100644 index 00000000..9821d212 --- /dev/null +++ b/modelscope/models/science/unifold/modules/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data pipeline for model features.""" From 943478de635393e957bb0bf6ad677fdd189ac5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 13:32:57 +0800 Subject: [PATCH 26/46] Update --- .../models/science/unifold/modules/__init__.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/modelscope/models/science/unifold/modules/__init__.py b/modelscope/models/science/unifold/modules/__init__.py index 9821d212..63aa84ed 100644 --- a/modelscope/models/science/unifold/modules/__init__.py +++ b/modelscope/models/science/unifold/modules/__init__.py @@ -1,14 +1,3 @@ -# Copyright 2021 DeepMind Technologies Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Data pipeline for model features.""" +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. +"""Unifold Modules.""" From 2759d538bb30c8c82d0dd32ea3b4bcd7606d41d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 14:59:45 +0800 Subject: [PATCH 27/46] fix ut level for unifold --- tests/pipelines/test_unifold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_unifold.py b/tests/pipelines/test_unifold.py index df35dc5e..47bb7874 100644 --- a/tests/pipelines/test_unifold.py +++ b/tests/pipelines/test_unifold.py @@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck): self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \ 'NIAALKNHIDKIKPIAMQIYKKYSKNIP' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id) mono_pipeline_ins = pipeline(task=self.task, model=model_dir) From cc76d900bcf2a7aae0a41d02d861f1865aba4b2c Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 15:31:08 +0800 Subject: [PATCH 28/46] add model name to baseModel. use model name as tag --- modelscope/hub/t_jy.py | 16 ++++++++++++++++ modelscope/models/base/base_model.py | 2 ++ modelscope/pipelines/base.py | 5 +++-- modelscope/trainers/trainer.py | 8 ++++---- 4 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 modelscope/hub/t_jy.py diff --git a/modelscope/hub/t_jy.py b/modelscope/hub/t_jy.py new file mode 100644 index 00000000..baf84f46 --- /dev/null +++ b/modelscope/hub/t_jy.py @@ -0,0 +1,16 @@ +def dec(param1): + print(param1) + + def in_dec(func): + def in_func(name): + return func(name) + return in_func + return in_dec + + +@dec("dec1") +def aa(param): + print(param) + return + +aa("heell") \ No newline at end of file diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 1ca7e030..721478c3 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -131,6 +131,8 @@ class Model(ABC): if not hasattr(model, 'cfg'): model.cfg = cfg + + model.name = model_name_or_path return model def save_pretrained(self, diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 9280cc09..b9a4a25c 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -152,8 +152,9 @@ class Pipeline(ABC): **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function - model_name = self.cfg.model.type - create_library_statistics("pipeline", model_name, None) + for single_model in self.models: + if hasattr(single_model, 'name'): + create_library_statistics("pipeline", single_model.name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 522405ff..2e79667f 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -437,8 +437,8 @@ class EpochBasedTrainer(BaseTrainer): def train(self, 
checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - model_name = self.cfg.model.type - create_library_statistics("train", model_name, None) + if hasattr(self.model, 'name'): + create_library_statistics("train", self.model.name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -459,8 +459,8 @@ class EpochBasedTrainer(BaseTrainer): self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): - model_name = self.cfg.model.type - create_library_statistics("evaluate", model_name, None) + if hasattr(self.model, 'name'): + create_library_statistics("evaluate", self.model.name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From 184c35f80031574d53019124d56637ddfca4aa66 Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 15:32:04 +0800 Subject: [PATCH 29/46] rm useless --- modelscope/hub/t_jy.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 modelscope/hub/t_jy.py diff --git a/modelscope/hub/t_jy.py b/modelscope/hub/t_jy.py deleted file mode 100644 index baf84f46..00000000 --- a/modelscope/hub/t_jy.py +++ /dev/null @@ -1,16 +0,0 @@ -def dec(param1): - print(param1) - - def in_dec(func): - def in_func(name): - return func(name) - return in_func - return in_dec - - -@dec("dec1") -def aa(param): - print(param) - return - -aa("heell") \ No newline at end of file From 84032f90e3f2b4a183725ceda16a4b1dc204c2f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:34:58 +0800 Subject: [PATCH 30/46] add event tracking --- modelscope/hub/api.py | 20 ++++++++++++++------ modelscope/msdatasets/ms_dataset.py | 16 ++-------------- modelscope/utils/constant.py | 8 ++++++++ requirements/framework.txt | 2 +- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 0262fc1d..f2ff822d 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -39,8 +39,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, DatasetFormations, - DatasetMetaFormats, DownloadMode, - ModelFile) + DatasetMetaFormats, DownloadChannel, + DownloadMode, ModelFile) from modelscope.utils.logger import get_logger from .utils.utils import (get_endpoint, get_release_datetime, model_id_to_group_owner_name) @@ -646,15 +646,23 @@ class HubApi: def check_local_cookies(self, use_cookies) -> CookieJar: return self._check_cookie(use_cookies=use_cookies) - def count_uv_by_channel(self, dataset_name: str, namespace: str, channel: str): - # todo: 1. check args 2. 
+ def dataset_download_uv(self, dataset_name: str, namespace: str): + if not dataset_name or not namespace: + raise ValueError('dataset_name or namespace cannot be empty!') - url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}' + # get channel and user_name + channel = DownloadChannel.LOCAL.value + user_name = '' + if MODELSCOPE_ENVIRONMENT in os.environ: + channel = os.environ[MODELSCOPE_ENVIRONMENT] + if MODELSCOPE_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_USERNAME] + + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' cookies = ModelScopeConfig.get_cookies() r = requests.post(url, cookies=cookies, headers=self.headers) resp = r.json() raise_on_error(resp) - print(resp) return resp['Message'] diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index a7d29990..5c8ea59f 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -274,6 +274,8 @@ class MsDataset: try: api.on_dataset_download( dataset_name=download_dataset, namespace=namespace) + api.dataset_download_uv( + dataset_name=download_dataset, namespace=namespace) except Exception as e: logger.error(e) @@ -727,17 +729,3 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg - - -if __name__ == '__main__': - from modelscope.hub.api import HubApi - api = HubApi() - # api.login('c252d64a-ce7b-4c0c-b583-7bedf628c7da') # online - # api.login('aa14716f-e2de-4f26-bf49-254d81eb8ac6') # test - - channel = 'local' # dsw - dataset_name = 'small_coco_for_test' - namespace = 'wangxingjun778test' - resp = api.count_uv_by_channel( - dataset_name=dataset_name, namespace=namespace, channel=channel) - print(resp) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2729b75a..f0a97dbd 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -238,6 +238,14 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' +class DownloadChannel(enum.Enum): + """ Channels of datasets downloading for uv/pv counting. + """ + LOCAL = 'local' + DSW = 'dsw' + EAIS = 'eais' + + class UploadMode(enum.Enum): """ How to upload object to remote. 
""" diff --git a/requirements/framework.txt b/requirements/framework.txt index 17fbd8a3..e78bc9a9 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,7 +1,7 @@ addict attrs # version beyond 2.6.0 introduces compatbility issue and is being resolved -datasets<=2.6.0 +datasets<=2.5.2 easydict einops filelock>=3.3.0 From 79c44a68102e182b3194e3b9e6244d4891859274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:41:01 +0800 Subject: [PATCH 31/46] add event tracking --- requirements/framework.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index e78bc9a9..a86c0cc5 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -# version beyond 2.6.0 introduces compatbility issue and is being resolved +# version beyond 2.5.2 introduces compatbility issue and is being resolved datasets<=2.5.2 easydict einops From 63a08e7be68bce218eb6ca755ecbc821017d83b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:49:21 +0800 Subject: [PATCH 32/46] add event tracking --- tests/msdatasets/test_dataset_upload.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 3d35d480..b67c2ebb 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -104,7 +104,11 @@ class DatasetUploadTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_download_dir(self): - test_ds = MsDataset.load(self.dataset_name, self.namespace) + from modelscope.utils.constant import DownloadMode + test_ds = MsDataset.load( + self.dataset_name, + namespace=self.namespace, + download_mode=DownloadMode.FORCE_REDOWNLOAD) assert test_ds.config_kwargs['split_config'].values() @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') From e45ab2c32d66a3ae8014be045d773719b82cb0cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8F=AD=E6=89=AC?= Date: Tue, 1 Nov 2022 15:51:00 +0800 Subject: [PATCH 33/46] add event tracking --- tests/msdatasets/test_dataset_upload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index b67c2ebb..d91f24d7 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -8,7 +8,8 @@ import zipfile from modelscope.msdatasets import MsDataset from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects from modelscope.utils import logger as logging -from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode, + ModelFile) from modelscope.utils.test_utils import test_level logger = logging.get_logger(__name__) @@ -104,7 +105,6 @@ class DatasetUploadTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_download_dir(self): - from modelscope.utils.constant import DownloadMode test_ds = MsDataset.load( self.dataset_name, namespace=self.namespace, From 5f3c9433fc83bc13fb00d552270e5dc8d6933854 Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 16:35:46 +0800 Subject: [PATCH 34/46] fix format --- modelscope/hub/api.py | 1 - modelscope/hub/utils/utils.py | 2 +- 2 files changed, 1 
insertion(+), 2 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 224c55ff..7468e5e3 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -647,7 +647,6 @@ class HubApi: return self._check_cookie(use_cookies=use_cookies) - class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) COOKIES_FILE_NAME = 'cookies' diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 5c915998..312647c2 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -95,7 +95,7 @@ def create_library_statistics(method: str, try: path = f'{get_endpoint()}/api/v1/statistics/library' headers = {'user-agent': ModelScopeConfig.get_user_agent()} - params = {"Method": method, "Name": name, "CnName": cn_name} + params = {'Method': method, 'Name': name, 'CnName': cn_name} r = requests.post(path, params=params, headers=headers) r.raise_for_status() except Exception: From 76bb518d75818ce8e19afa0f0b775b00ac9a72cd Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 16:59:47 +0800 Subject: [PATCH 35/46] fix format --- modelscope/hub/utils/utils.py | 8 +++----- modelscope/trainers/trainer.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 312647c2..f9a75cce 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -2,10 +2,11 @@ import hashlib import os +import requests from datetime import datetime from typing import Optional -import requests +from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, @@ -13,7 +14,6 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, from modelscope.hub.errors import FileIntegrityError from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger -from modelscope.hub.api import ModelScopeConfig logger = get_logger() @@ -89,9 +89,7 @@ def file_integrity_validation(file_path, expected_sha256): raise FileIntegrityError(msg) -def create_library_statistics(method: str, - name: str, - cn_name: Optional[str]): +def create_library_statistics(method: str, name: str, cn_name: Optional[str]): try: path = f'{get_endpoint()}/api/v1/statistics/library' headers = {'user-agent': ModelScopeConfig.get_user_agent()} diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 2e79667f..d59c3dfc 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -14,6 +14,7 @@ from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler +from modelscope.hub.utils.utils import create_library_statistics from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics @@ -39,7 +40,6 @@ from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) -from modelscope.hub.utils.utils import create_library_statistics from .base import BaseTrainer from .builder import TRAINERS from .default_config import merge_cfg From 30c8c27145261a3e5c7606976e11faef733d3f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 
Nov 2022 17:06:30 +0800 Subject: [PATCH 36/46] up requirements --- requirements/science.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/science.txt b/requirements/science.txt index 72994f72..c345da99 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -4,3 +4,5 @@ ml_collections scipy tensorboardX tokenizers +biopython +ipdb From 853e5235d56bf35922cde0db843cb62353e19a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:32:04 +0800 Subject: [PATCH 37/46] fix requirements --- requirements/science.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index c345da99..636f98f4 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ -iopath +biopython lmdb ml_collections scipy tensorboardX tokenizers -biopython -ipdb +iopath +ipdb \ No newline at end of file From 9ae5b67204e5648eb54e1ea43ca741623c87e1da Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 1 Nov 2022 17:40:28 +0800 Subject: [PATCH 38/46] fix style issues --- modelscope/hub/utils/utils.py | 3 ++- modelscope/pipelines/base.py | 4 ++-- modelscope/trainers/trainer.py | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index f9a75cce..d0a87cbd 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -2,10 +2,11 @@ import hashlib import os -import requests from datetime import datetime from typing import Optional +import requests + from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b9a4a25c..68010012 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -10,6 +10,7 @@ from typing import Any, Dict, Generator, List, Mapping, Union import numpy as np +from modelscope.hub.utils.utils import create_library_statistics from modelscope.models.base import Model from modelscope.msdatasets import MsDataset from modelscope.outputs import TASK_OUTPUTS @@ -23,7 +24,6 @@ from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import _find_free_port, _is_free_port -from modelscope.hub.utils.utils import create_library_statistics from .util import is_model, is_official_hub_path if is_torch_available(): @@ -154,7 +154,7 @@ class Pipeline(ABC): # modelscope library developer will handle this function for single_model in self.models: if hasattr(single_model, 'name'): - create_library_statistics("pipeline", single_model.name, None) + create_library_statistics('pipeline', single_model.name, None) # place model to cpu or gpu if (self.model or (self.has_multiple_models and self.models[0])): if not self._model_prepare: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index d59c3dfc..12c25f30 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -14,8 +14,8 @@ from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler -from modelscope.hub.utils.utils import create_library_statistics from modelscope.hub.snapshot_download import snapshot_download +from 
modelscope.hub.utils.utils import create_library_statistics from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel @@ -438,7 +438,7 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN if hasattr(self.model, 'name'): - create_library_statistics("train", self.model.name, None) + create_library_statistics('train', self.model.name, None) if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -460,7 +460,7 @@ class EpochBasedTrainer(BaseTrainer): def evaluate(self, checkpoint_path=None): if hasattr(self.model, 'name'): - create_library_statistics("evaluate", self.model.name, None) + create_library_statistics('evaluate', self.model.name, None) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import CheckpointHook CheckpointHook.load_checkpoint(checkpoint_path, self) From 420b63f03b55d5c2a591fd69cd060ed3a8141ef4 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 1 Nov 2022 17:44:18 +0800 Subject: [PATCH 39/46] fix style issues --- requirements/science.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index 636f98f4..c30ff644 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ biopython +iopath +ipdb lmdb ml_collections scipy tensorboardX tokenizers -iopath -ipdb \ No newline at end of file From aecb88044eba1789a675f22a32cc6f2eed71b91a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B9=B2=E5=8A=B2?= Date: Tue, 1 Nov 2022 17:44:37 +0800 Subject: [PATCH 40/46] up --- requirements/science.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/science.txt b/requirements/science.txt index 636f98f4..c30ff644 100644 --- a/requirements/science.txt +++ b/requirements/science.txt @@ -1,8 +1,8 @@ biopython +iopath +ipdb lmdb ml_collections scipy tensorboardX tokenizers -iopath -ipdb \ No newline at end of file From f2faf3acb38e3ccb6e62379e4314f00c844db36f Mon Sep 17 00:00:00 2001 From: "jiangyu.xzy" Date: Tue, 1 Nov 2022 18:04:48 +0800 Subject: [PATCH 41/46] fix import bug --- modelscope/hub/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index d0a87cbd..61d560fa 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -7,7 +7,6 @@ from typing import Optional import requests -from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, @@ -92,6 +91,7 @@ def file_integrity_validation(file_path, expected_sha256): def create_library_statistics(method: str, name: str, cn_name: Optional[str]): try: + from modelscope.hub.api import ModelScopeConfig path = f'{get_endpoint()}/api/v1/statistics/library' headers = {'user-agent': ModelScopeConfig.get_user_agent()} params = {'Method': method, 'Name': name, 'CnName': cn_name} From e870d55e28b97732686849a22084ed7dca4c2182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AF=BF=E5=B7=9E?= Date: Tue, 1 Nov 2022 20:31:16 +0800 Subject: [PATCH 42/46] fix no face bug and adaptive for 360 degree of head --- .../face_2d_keypoints_pipeline.py | 136 +++++++----------- 1 file changed, 53 insertions(+), 83 deletions(-) diff --git 
a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 4de5a4f2..94cbb74e 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -12,8 +12,11 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger from .base import EasyCVPipeline +logger = get_logger() + @PIPELINES.register_module( Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints) @@ -123,54 +126,28 @@ class Face2DKeypointsPipeline(EasyCVPipeline): return s / 3 * sigma def rotate_crop_img(self, img, pts, M): - image_size = 256 - enlarge_ratio = 1.1 - imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0]))) x1 = pts[5][0] + x2 = pts[5][0] y1 = pts[5][1] - x2 = pts[6][0] - y2 = pts[6][1] - w = x2 - x1 + 1 - h = y2 - y1 + 1 - x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) - y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) - - new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) - new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) - new_x1 = x1 + int(self.random_normal() * image_size * 0.05) - new_y1 = y1 + int(self.random_normal() * image_size * 0.05) - new_x2 = new_x1 + new_w - new_y2 = new_y1 + new_h + y2 = pts[5][1] + for i in range(0, 9): + x1 = min(x1, pts[i][0]) + x2 = max(x2, pts[i][0]) + y1 = min(y1, pts[i][1]) + y2 = max(y2, pts[i][1]) height, width, _ = imgT.shape - dx = max(0, -new_x1) - dy = max(0, -new_y1) - new_x1 = max(0, new_x1) - new_y1 = max(0, new_y1) + x1 = min(max(0, int(x1)), width) + y1 = min(max(0, int(y1)), height) + x2 = min(max(0, int(x2)), width) + y2 = min(max(0, int(y2)), height) + sub_imgT = imgT[y1:y2, x1:x2] - edx = max(0, new_x2 - width) - edy = max(0, new_y2 - height) - new_x2 = min(width, new_x2) - new_y2 = min(height, new_y2) + return sub_imgT, imgT, [x1, y1, x2, y2] - sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] - if dx > 0 or dy > 0 or edx > 0 or edy > 0: - sub_imgT = cv2.copyMakeBorder( - sub_imgT, - dy, - edy, - dx, - edx, - cv2.BORDER_CONSTANT, - value=(103.94, 116.78, 123.68)) - - return sub_imgT, imgT, [new_x1, new_y1, new_x2, - new_y2], [dx, dy, edx, edy] - - def crop_img(self, imgT, pts, angle): - image_size = 256 + def crop_img(self, imgT, pts): enlarge_ratio = 1.1 x1 = np.min(pts[:, 0]) @@ -181,94 +158,87 @@ class Face2DKeypointsPipeline(EasyCVPipeline): h = y2 - y1 + 1 x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w) y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h) + x1 = max(0, x1) + y1 = max(0, y1) - new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w) - new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h) - new_x1 = x1 + int(self.random_normal() * image_size * 0.05) - new_y1 = y1 + int(self.random_normal() * image_size * 0.05) + new_w = int(enlarge_ratio * w) + new_h = int(enlarge_ratio * h) + new_x1 = x1 + new_y1 = y1 new_x2 = new_x1 + new_w new_y2 = new_y1 + new_h - new_xy = new_x1, new_y1 - pts = pts - new_xy - height, width, _ = imgT.shape - dx = max(0, -new_x1) - dy = max(0, -new_y1) - new_x1 = max(0, new_x1) - new_y1 = max(0, new_y1) - edx = max(0, new_x2 - width) - edy = max(0, new_y2 - height) - new_x2 = min(width, new_x2) - new_y2 = min(height, new_y2) + new_x1 = min(max(0, new_x1), width) + new_y1 = min(max(0, 
new_y1), height) + new_x2 = max(min(width, new_x2), 0) + new_y2 = max(min(height, new_y2), 0) sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2] - if dx > 0 or dy > 0 or edx > 0 or edy > 0: - sub_imgT = cv2.copyMakeBorder( - sub_imgT, - dy, - edy, - dx, - edx, - cv2.BORDER_CONSTANT, - value=(103.94, 116.78, 123.68)) - - return sub_imgT, [new_x1, new_y1, new_x2, new_y2], [dx, dy, edx, edy] - def __call__(self, inputs) -> Any: - image_size = 256 + return sub_imgT, [new_x1, new_y1, new_x2, new_y2] + def __call__(self, inputs) -> Any: img = LoadImage.convert_to_ndarray(inputs) h, w, c = img.shape img_rgb = copy.deepcopy(img) img_rgb = img_rgb[:, :, ::-1] det_result = self.face_detection(img_rgb) + + bboxes = np.array(det_result[OutputKeys.BOXES]) + if bboxes.shape[0] == 0: + logger.warn('No face detected!') + results = { + OutputKeys.KEYPOINTS: [], + OutputKeys.POSES: [], + OutputKeys.BOXES: [] + } + return results + boxes, keypoints = self._choose_face(det_result) output_boxes = [] output_keypoints = [] output_poses = [] - for idx, box_ori in enumerate(boxes): - box = self.expend_box(box_ori, w, h, scalex=0.15, scaley=0.15) + for index, box_ori in enumerate(boxes): + box = self.expend_box(box_ori, w, h, scalex=0.1, scaley=0.1) y0 = int(box[1]) y1 = int(box[3]) x0 = int(box[0]) x1 = int(box[2]) sub_img = img[y0:y1, x0:x1] - keypoint = keypoints[idx] + keypoint = keypoints[index] pts = [[keypoint[0], keypoint[1]], [keypoint[2], keypoint[3]], [keypoint[4], keypoint[5]], [keypoint[6], keypoint[7]], [keypoint[8], keypoint[9]], [box[0], box[1]], - [box[2], box[3]]] + [box[2], box[1]], [box[0], box[3]], [box[2], box[3]]] # radian angle = math.atan2((pts[1][1] - pts[0][1]), (pts[1][0] - pts[0][0])) # angle theta = angle * (180 / np.pi) - center = [image_size // 2, image_size // 2] + center = [w // 2, h // 2] cx, cy = center M, landmark_ = self.rotate_point(theta, (cx, cy), pts) - sub_img, imgT, bbox, delta_border = self.rotate_crop_img( - img, pts, M) + sub_imgT, imgT, bbox = self.rotate_crop_img(img, landmark_, M) - outputs = self.predict_op([sub_img])[0] + outputs = self.predict_op([sub_imgT])[0] tmp_keypoints = outputs['point'] for idx in range(0, len(tmp_keypoints)): - tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) - tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + tmp_keypoints[idx][0] += bbox[0] + tmp_keypoints[idx][1] += bbox[1] - for idx in range(0, 3): - sub_img, bbox, delta_border = self.crop_img( - imgT, tmp_keypoints, 0) + for idx in range(0, 6): + sub_img, bbox = self.crop_img(imgT, tmp_keypoints) outputs = self.predict_op([sub_img])[0] tmp_keypoints = outputs['point'] for idx in range(0, len(tmp_keypoints)): - tmp_keypoints[idx][0] += (delta_border[0] + bbox[0]) - tmp_keypoints[idx][1] += (delta_border[1] + bbox[1]) + tmp_keypoints[idx][0] += bbox[0] + tmp_keypoints[idx][1] += bbox[1] M2, tmp_keypoints = self.rotate_point(-theta, (cx, cy), tmp_keypoints) From 30128b698916c526d4ee4d3d77e09c58f5612621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AF=BF=E5=B7=9E?= Date: Tue, 1 Nov 2022 20:42:58 +0800 Subject: [PATCH 43/46] update --- .../easycv_pipelines/face_2d_keypoints_pipeline.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 94cbb74e..29a96a5f 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -113,18 
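The face-keypoint pipeline above rotates the whole image by theta around the image centre and pushes the detector's landmarks and box corners through the same transform. A small sketch of that idea (cv2.getRotationMatrix2D stands in for whatever rotate_point builds internally, which this diff does not show):

import cv2
import numpy as np

def rotate_image_and_points(img, points, theta_degrees):
    # Rotate an image and its landmarks with one shared 2x3 affine matrix.
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), theta_degrees, 1.0)
    rotated_img = cv2.warpAffine(img, M, (w, h))
    pts = np.hstack([np.asarray(points, np.float32),
                     np.ones((len(points), 1), np.float32)])  # homogeneous (x, y, 1)
    rotated_pts = pts @ M.T                                    # N x 2 rotated (x, y)
    return rotated_img, rotated_pts

Clamping the resulting coordinates to [0, w] and [0, h], as the rewritten rotate_crop_img and crop_img now do, also removes the need for the old copyMakeBorder padding.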
+113,6 @@ class Face2DKeypointsPipeline(EasyCVPipeline): for (x, y) in landmark]) return M, landmark_ - def random_normal(self): - """ - 3-sigma rule - return: (-1, +1) - """ - mu, sigma = 0, 1 - while True: - s = np.random.normal(mu, sigma) - if s < mu - 3 * sigma or s > mu + 3 * sigma: - continue - return s / 3 * sigma - def rotate_crop_img(self, img, pts, M): imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0]))) From 1ca24299da877b92387c40403e2bb420489acff9 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 2 Nov 2022 13:51:59 +0800 Subject: [PATCH 44/46] [to #45892407]fix: fix pytorch_lighting incompatible with taming-transformers-rom1504 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10604329 * [to #45892407]fix: fix pytorch_lighting incompatible with taming-transformers-rom1504 --- requirements/multi-modal.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 578f0b54..31e9601d 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -2,6 +2,8 @@ ftfy>=6.0.3 ofa>=0.0.2 pycocoevalcap>=1.2 pycocotools>=2.0.4 +# compatible with taming-transformers-rom1504 +pytorch_lightning<=1.7.7 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 From 93a52ec42d7fe5c683257f650d9449ac0f45c2cb Mon Sep 17 00:00:00 2001 From: "yingda.chen" Date: Wed, 2 Nov 2022 14:07:48 +0800 Subject: [PATCH 45/46] update README Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10601974 --- README.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 944c1f07..1da48ef2 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,26 @@ # Introduction -ModelScope library is targeted to support training, evaluation and inference for the state of the art models provided by Mind and further support third-party models provided by users outside alibaba. +[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bringing together most advanced machine learning models from the AI community, and to streamlining the process of leveraging and applying AI models . The core ModelScope library enables developers to perform model inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains. -# Design doc +The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modality, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done within only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary. -Please refer to alidoc [link](https://alidocs.dingtalk.com/i/nodes/OBldywvrKxo89xmAO05yJQk2ngpNbLz4?nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA&iframeQuery=utm_source%3Dportal%26utm_medium%3Dportal_space_file_tree) +Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with the backend services of ModelScope, particularly with the Model-Hub and Dataset-Hub. 
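To make the "few lines of code" claim concrete, inference with one of the NER models exercised earlier in this patch series looks roughly like this (assuming network access to the ModelScope hub so the checkpoint can be downloaded):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# The model id and sample sentence below are the ones used by
# tests/pipelines/test_named_entity_recognition.py in this repo.
ner = pipeline(
    task=Tasks.named_entity_recognition,
    model='damo/nlp_raner_named-entity-recognition_chinese-base-news')
print(ner(input='这与温岭市新河镇的一个神秘的传说有关。'))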
Such interactions allow the management of the various entities (models and datasets) to be performed seamlessly under the hood, including entity lookup, version control, and cache management. -# Development doc +# Installation -Please refer to [develop.md](docs/source/develop.md) +Please refer to [installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85). -# ChangeLog -* 20/05/2022 First release version +# Get Started -Refer to [change_log.md](docs/source/change_log.md) for more details +You can refer to [quick_start](https://modelscope.cn/docs/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B) for a quick start. + +We also provide other documentation, including: +* [Introduction to tasks](https://modelscope.cn/docs/%E4%BB%BB%E5%8A%A1%E7%9A%84%E4%BB%8B%E7%BB%8D) +* [Use pipeline for model inference](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) +* [Finetune example](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) +* [Preprocessing of data](https://modelscope.cn/docs/%E6%95%B0%E6%8D%AE%E7%9A%84%E9%A2%84%E5%A4%84%E7%90%86) +* [Evaluation metrics](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AF%84%E4%BC%B0) + +# License + +This project is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). From 5f1b9a621871f310ee44138c62b588bbc7d83c73 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Wed, 2 Nov 2022 14:23:26 +0800 Subject: [PATCH 46/46] add default config and fix preprocess detokenizer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10603232 --- .../models/multi_modal/ofa_for_all_tasks.py | 18 ++++++++++++++- modelscope/preprocessors/multi_modal.py | 2 +- .../preprocessors/ofa/ocr_recognition.py | 13 +++-------- .../multi_modal/ofa/ofa_trainer_utils.py | 22 +++++++++---------- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 2c6034e8..fc578b25 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates.
import math import os +import re import string from functools import partial from os import path as osp @@ -110,6 +111,8 @@ class OfaForAllTasks(TorchModel): Tasks.text_classification: inference_d[self.gen_type], Tasks.image_classification: inference_d[self.gen_type], } + pattern_str = '((?<=[^ a-zA-Z0-9.,:!?]) +| +(?=[^ a-zA-Z0-9.,:!?]))' + self.pattern = re.compile(pattern_str) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: input = move_to_device(input, self.model.device) @@ -135,8 +138,18 @@ class OfaForAllTasks(TorchModel): caption = input[OutputKeys.CAPTION] result_l = list() for cap in caption: - result_l.append(cap.translate(self.transtab).strip()) + if self.language == 'en': + result_l.append(cap.translate(self.transtab).strip()) + else: + result_l.append(cap) input[OutputKeys.CAPTION] = result_l + if self.gen_type == 'generation' and self.language in [ + 'zh', 'cn' + ] and self.cfg.task != Tasks.visual_grounding: + ret_l = list() + for text in input[OFA_TASK_KEY_MAPPING[self.cfg.task]]: + ret_l.append(self.detokenizer(text)) + input[OFA_TASK_KEY_MAPPING[self.cfg.task]] = ret_l return input def _text_gen_inference(self, input): @@ -314,3 +327,6 @@ class OfaForAllTasks(TorchModel): save_function=partial(save_function, with_meta=False), config=config, **kwargs) + + def detokenizer(self, text): + return self.pattern.sub('', text) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 13876058..3a3ae820 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -77,7 +77,7 @@ class OfaPreprocessor(Preprocessor): data[key] = item return data - def _ofa_input_compatibility_conversion(self, data): + def _ofa_input_compatibility_conversion(self, data): # fake if 'image' in data and self.cfg.model.get('type', None) == 'ofa': if isinstance(data['image'], str): image = load_image(data['image']) diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index a0342c14..58e3ea6e 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -73,21 +73,14 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): """ super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) - # Initialize transform - if self.cfg.model.imagenet_default_mean_and_std: - mean = IMAGENET_DEFAULT_MEAN - std = IMAGENET_DEFAULT_STD - else: - mean = [0.5, 0.5, 0.5] - std = [0.5, 0.5, 0.5] self.patch_resize_transform = transforms.Compose([ lambda image: ocr_resize( image, - self.cfg.model.patch_image_size, - is_document=self.cfg.model.is_document), + self.patch_image_size, + is_document=self.cfg.model.get('is_document', False)), transforms.ToTensor(), - transforms.Normalize(mean=mean, std=std), + transforms.Normalize(mean=self.mean, std=self.std), ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 3c38884c..3930febb 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -103,20 +103,20 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): def __init__(self, args): super().__init__() - self.sentence_avg = args.sentence_avg - self.eps = args.label_smoothing - self.ignore_prefix_size = args.ignore_prefix_size - self.ignore_eos = args.ignore_eos - 
self.report_accuracy = args.report_accuracy - self.drop_worst_ratio = args.drop_worst_ratio - self.drop_worst_after = args.drop_worst_after - self.use_rdrop = args.use_rdrop - self.reg_alpha = args.reg_alpha - self.sample_patch_num = args.sample_patch_num + self.sentence_avg = args.get('sentence_avg', False) + self.eps = args.get('label_smoothing', 0.1) + self.ignore_prefix_size = args.get('ignore_prefix_size', 0) + self.ignore_eos = args.get('ignore_eos', False) + self.report_accuracy = args.get('report_accuracy', False) + self.drop_worst_ratio = args.get('drop_worst_ratio', 0.0) + self.drop_worst_after = args.get('drop_worst_after', 0) + self.use_rdrop = args.get('use_rdrop', False) + self.reg_alpha = args.get('reg_alpha', 1.0) + self.sample_patch_num = args.get('sample_patch_num', 196) self.constraint_start = None self.constraint_end = None - if args.constraint_range: + if args.get('constraint_range', None): constraint_start, constraint_end = args.constraint_range.split(',') self.constraint_start = int(constraint_start) self.constraint_end = int(constraint_end)
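Note on the detokenizer added in [PATCH 46/46]: the regular expression compiled in OfaForAllTasks.__init__ strips the spaces that the OFA tokenizer leaves around Chinese (non-ASCII) tokens, while keeping the spaces between plain ASCII words. Below is a minimal standalone sketch of that behaviour; only the pattern string is taken from the diff above, and the helper name detokenize and the sample strings are illustrative, not part of the patch.

import re

# Same pattern string as the one compiled in OfaForAllTasks.__init__ above.
# It matches runs of spaces that touch a character outside " a-zA-Z0-9.,:!?",
# i.e. spaces adjacent to CJK text; spaces between ASCII words are left alone.
pattern = re.compile('((?<=[^ a-zA-Z0-9.,:!?]) +| +(?=[^ a-zA-Z0-9.,:!?]))')

def detokenize(text):
    # Mirrors OfaForAllTasks.detokenizer: drop the matched space runs entirely.
    return pattern.sub('', text)

print(detokenize('你 好 , world'))  # -> '你好, world'
print(detokenize('hello world'))    # -> 'hello world'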