From d7298862b0d851b422b685824c08f06175d4441d Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sun, 9 Oct 2022 18:12:47 +0800 Subject: [PATCH 01/29] add citest and inter test --- .github/workflows/citest.yaml | 55 +++++++++++++++++++++++++++++++++++ .github/workflows/lint.yaml | 24 +++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 .github/workflows/citest.yaml create mode 100644 .github/workflows/lint.yaml diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml new file mode 100644 index 00000000..65cde97e --- /dev/null +++ b/.github/workflows/citest.yaml @@ -0,0 +1,55 @@ +name: citest + +on: + push: + branches: + - master + - "release/**" + paths-ignore: + - "setup.*" + - "requirements.txt" + - "requirements/**" + - "docs/**" + - "tools/**" + - ".dev_scripts/**" + - "README.md" + - "README_zh-CN.md" + - "NOTICE" + - ".github/workflows/lint.yaml" + - ".github/workflows/publish.yaml" + + pull_request: + paths-ignore: + - "setup.*" + - "requirements.txt" + - "requirements/**" + - "docs/**" + - "tools/**" + - ".dev_scripts/**" + - "README.md" + - "README_zh-CN.md" + - "NOTICE" + - ".github/workflows/lint.yaml" + - ".github/workflows/publish.yaml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unittest: + # The type of runner that the job will run on + runs-on: [modelscope-self-hosted] + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + lfs: 'true' + - name: Checkout LFS objects + run: git lfs checkout + - name: Run unittest + shell: bash + run: | + set -e + source ~/ci_env.sh + bash .dev_scripts/dockerci.sh diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..1ac76975 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,24 @@ +name: Lint test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.6 + uses: actions/setup-python@v2 + with: + python-version: 3.6 + - name: Install pre-commit hook + run: | + pip install pre-commit + cp .github/hooks/pre-commit .git/hooks/ + - name: Linting + run: pre-commit run --all-files + From 8be89d9f6ce10b6a67938318754a2ceb811721f9 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sun, 9 Oct 2022 18:16:46 +0800 Subject: [PATCH 02/29] update lint.yaml --- .github/workflows/lint.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 1ac76975..8a073a01 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -18,7 +18,6 @@ jobs: - name: Install pre-commit hook run: | pip install pre-commit - cp .github/hooks/pre-commit .git/hooks/ - name: Linting run: pre-commit run --all-files From 208823401fb16541b773dcea87d04f5b145ba25e Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sun, 9 Oct 2022 18:29:06 +0800 Subject: [PATCH 03/29] fix format --- .github/workflows/citest.yaml | 2 +- .github/workflows/lint.yaml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index 65cde97e..82b713e5 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -52,4 +52,4 @@ jobs: run: | set -e source ~/ci_env.sh - bash .dev_scripts/dockerci.sh + bash .dev_scripts/dockerci.sh diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 
8a073a01..34f7abe7 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -20,4 +20,3 @@ jobs: pip install pre-commit - name: Linting run: pre-commit run --all-files - From 6e9ab972d4a9ab3b0d458d91ae42a99a04ea61b4 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 10 Oct 2022 10:35:27 +0800 Subject: [PATCH 04/29] update cienv path --- .github/workflows/citest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index 82b713e5..b16d3f16 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -51,5 +51,5 @@ jobs: shell: bash run: | set -e - source ~/ci_env.sh + source /mnt/modelscope/ci_env.sh bash .dev_scripts/dockerci.sh From e7b37b09d4c753cdf081aab9fb7fcd3addcde500 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 10 Oct 2022 12:03:05 +0800 Subject: [PATCH 05/29] do linter test only in alibaba internal env --- .dev_scripts/ci_container_test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 129a6c25..95088f30 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -9,7 +9,9 @@ git config --global --add safe.directory /Maas-lib # linter test # use internal project for pre-commit due to the network problem -pre-commit run -c .pre-commit-config_local.yaml --all-files +if [ `git remote -v | grep alibaba | wc -l` -gt 1 ]; then + pre-commit run -c .pre-commit-config_local.yaml --all-files +fi if [ $? -ne 0 ]; then echo "linter test failed, please run 'pre-commit run --all-files' to check" exit -1 From ca7189e77a18ce0990788ed92b8fb8a695db41a0 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 10 Oct 2022 17:38:03 +0800 Subject: [PATCH 06/29] add dummy git config --- .dev_scripts/ci_container_test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 95088f30..fa5e4534 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -6,6 +6,8 @@ awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f pip install -r requirements/tests.txt git config --global --add safe.directory /Maas-lib +git config --global user.email tmp +git config --global user.name tmp.com # linter test # use internal project for pre-commit due to the network problem From 51e268dc9779e53bc04d6fdffe49c9b68c4c57b3 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 11:20:07 +0800 Subject: [PATCH 07/29] change filemode --- .github/workflows/citest.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index b16d3f16..3d2abae9 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -41,6 +41,14 @@ jobs: # The type of runner that the job will run on runs-on: [modelscope-self-hosted] steps: + - name: ResetFileMode + shell: bash + run: | + # reset filemode to allow action runner to delete files + # generated by root in docker + set -e + sudo chown -R $USER:$USER $ACTION_RUNNER_DIR + - name: Checkout uses: actions/checkout@v2 with: From 66819860b78d34d1c5106c6938b9a7c25c982d0d Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 11:33:59 +0800 Subject: [PATCH 08/29] format yaml --- .github/workflows/citest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/citest.yaml 
b/.github/workflows/citest.yaml index 3d2abae9..29f4a0b9 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -45,7 +45,7 @@ jobs: shell: bash run: | # reset filemode to allow action runner to delete files - # generated by root in docker + # generated by root in docker set -e sudo chown -R $USER:$USER $ACTION_RUNNER_DIR From 705040c04ab7404510febfae0ace369d546b371d Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 11:53:16 +0800 Subject: [PATCH 09/29] format --- .github/workflows/citest.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index 29f4a0b9..ede4b94f 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -44,10 +44,11 @@ jobs: - name: ResetFileMode shell: bash run: | - # reset filemode to allow action runner to delete files - # generated by root in docker + # reset filemode to allow action runner to delete files + # generated by root in docker set -e - sudo chown -R $USER:$USER $ACTION_RUNNER_DIR + echo "ACTION_RUNNER_DIR: $ACTION_RUNNER_DIR" + sudo chown -R $USER:$USER $ACTION_RUNNER_DIR - name: Checkout uses: actions/checkout@v2 From 7962f9067daedefa37f8f41d4e56840b6c691083 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 11:56:36 +0800 Subject: [PATCH 10/29] format --- .github/workflows/citest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index ede4b94f..00c6bbbf 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -47,7 +47,7 @@ jobs: # reset filemode to allow action runner to delete files # generated by root in docker set -e - echo "ACTION_RUNNER_DIR: $ACTION_RUNNER_DIR" + source ~/.bashrc sudo chown -R $USER:$USER $ACTION_RUNNER_DIR - name: Checkout From d325c41d1fbe8cd5aec8da72bad3ea16ce70c207 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 13:29:24 +0800 Subject: [PATCH 11/29] make movie_scene_seg isolated --- tests/run_config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/run_config.yaml b/tests/run_config.yaml index 4c571b7f..4bbdb92f 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -10,6 +10,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which - test_easycv_trainer.py - test_segformer.py - test_segmentation_pipeline.py + - test_movie_scene_segmentation.py envs: default: # default env, case not in other env will in default, pytorch. From 687766d9f82147ab96b6326685e47a49dd970681 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 17 Oct 2022 15:04:07 +0800 Subject: [PATCH 12/29] refactor readme and remove internal links --- README.md | 24 +++++++++++++++++------- docs/source/develop.md | 6 ++---- docs/source/quick_start.md | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 944c1f07..61c3207a 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,25 @@ ModelScope library is targeted to support training, evaluation and inference for the state of the art models provided by Mind and further support third-party models provided by users outside alibaba. 
-# Design doc +In order to enable ModelScope users to use the various models provided by ModelScope quickly and conveniently, we provide a complete Python library, which includes the implementation of ModelScope official models as well as inference, finetuning and evaluation support for those models, such as preprocessors and evaluation metrics. We also provide easy-to-use APIs and rich usage examples. By calling the library, users can write just a few lines of code to complete tasks such as model inference, training, and evaluation, and can also quickly carry out secondary development on this basis to realize their own innovative ideas. -Please refer to alidoc [link](https://alidocs.dingtalk.com/i/nodes/OBldywvrKxo89xmAO05yJQk2ngpNbLz4?nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA&iframeQuery=utm_source%3Dportal%26utm_medium%3Dportal_space_file_tree) +At present, the models provided by the library cover four main AI fields (image, natural language processing, speech, and multi-modality) and dozens of application scenarios and tasks. -# Development doc +# Installation -Please refer to [develop.md](docs/source/develop.md) +Please refer to [installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85). -# ChangeLog -* 20/05/2022 First release version +# Get Started -Refer to [change_log.md](docs/source/change_log.md) for more details +You can refer to [quick_start](https://modelscope.cn/docs/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B) for a quick start. + +We also provide other documentation, including: +* [Introduction to tasks](https://modelscope.cn/docs/%E4%BB%BB%E5%8A%A1%E7%9A%84%E4%BB%8B%E7%BB%8D) +* [Use pipeline for model inference](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) +* [Finetune example](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) +* [Preprocessing of data](https://modelscope.cn/docs/%E6%95%B0%E6%8D%AE%E7%9A%84%E9%A2%84%E5%A4%84%E7%90%86) +* [Evaluation metrics](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AF%84%E4%BC%B0) + +# License + +This project is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). diff --git a/docs/source/develop.md b/docs/source/develop.md index fad87d33..62801353 100644 --- a/docs/source/develop.md +++ b/docs/source/develop.md @@ -44,7 +44,7 @@ There are mainly three test levels: * level 2: scenario tests for all the implemented modules such as model, pipeline in different algorithm filed. Default test level is 0, which will only run those cases of level 0, you can set test level -via environment variable `TEST_LEVEL`. For more details, you can refer to [test-doc](https://alidocs.dingtalk.com/i/nodes/mdvQnONayjBJKLXy1Bp38PY2MeXzp5o0?dontjump=true&nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA) +via environment variable `TEST_LEVEL`. ```bash @@ -159,9 +159,7 @@ git pull origin branch_name git push --set-upstream origin dev/my-dev-branch ``` Note that you may push multiple times to the same branch with 'git push' commands later. -5. Open the remote url `https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/new` to create a new merge request that merges your development branch (aka, the "dev/my-dev-branch in this example) into master branch. Please follow the instruction on aone page to submit the merge request a code review. - - +5. Create a pull request on GitHub to merge your code into master.
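For illustration, a minimal sketch of the `TEST_LEVEL` convention described in the develop.md hunk above: a heavier scenario test is skipped unless the environment variable is raised above the default of 0. The helper name and the threshold used here are assumptions made for the sketch, not code taken from these patches.

```python
import os
import unittest


def current_test_level() -> int:
    # Illustrative helper (an assumption, not from these patches): read the
    # level from TEST_LEVEL, defaulting to 0 as develop.md describes.
    return int(os.getenv('TEST_LEVEL', '0'))


class ExampleScenarioTest(unittest.TestCase):

    @unittest.skipUnless(current_test_level() >= 1,
                         'skip: requires TEST_LEVEL >= 1')
    def test_heavy_scenario(self):
        # A level-1/2 style case would exercise full models or pipelines here.
        self.assertTrue(True)


if __name__ == '__main__':
    # e.g. `TEST_LEVEL=1 python test_example.py` also runs the gated case.
    unittest.main()
```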
## Build pip package ```bash diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 68979c55..7cefa048 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -74,7 +74,7 @@ pip install "modelscope[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyu ModelScope的源码可以直接clone到本地: ```shell -git clone git@gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib.git modelscope +git clone git@github.com:modelscope/modelscope.git cd modelscope git fetch origin master git checkout master From 78bf480662191b477fbeb84b715e236f705deb09 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 26 Oct 2022 19:16:17 +0800 Subject: [PATCH 13/29] update linter test with py37 --- .github/workflows/lint.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 34f7abe7..dc4b5487 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.7 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.7 - name: Install pre-commit hook run: | pip install pre-commit From c390dc0c791442314c7515e755008902c1b88a1b Mon Sep 17 00:00:00 2001 From: Yufeng <47727949+shuaigezhu@users.noreply.github.com> Date: Fri, 28 Oct 2022 17:09:27 +0800 Subject: [PATCH 14/29] add Mglm (#5) * mglm init * add mglm requirements Co-authored-by: Yufeng Co-authored-by: wenmeng.zwm --- modelscope/metainfo.py | 3 + modelscope/models/nlp/__init__.py | 2 + modelscope/models/nlp/mglm/__init__.py | 22 + modelscope/models/nlp/mglm/arguments.py | 793 +++++++++ modelscope/models/nlp/mglm/blocklm_utils.py | 625 +++++++ modelscope/models/nlp/mglm/configure_data.py | 513 ++++++ .../models/nlp/mglm/data_utils/__init__.py | 341 ++++ .../models/nlp/mglm/data_utils/corpora.py | 583 ++++++ .../models/nlp/mglm/data_utils/datasets.py | 1244 +++++++++++++ .../models/nlp/mglm/data_utils/extraction.py | 71 + .../models/nlp/mglm/data_utils/file_utils.py | 256 +++ .../models/nlp/mglm/data_utils/lazy_loader.py | 286 +++ .../models/nlp/mglm/data_utils/samplers.py | 190 ++ .../nlp/mglm/data_utils/sp_tokenizer.py | 158 ++ .../nlp/mglm/data_utils/tokenization.py | 1396 +++++++++++++++ .../nlp/mglm/data_utils/tokenization_gpt2.py | 359 ++++ .../models/nlp/mglm/data_utils/wordpiece.py | 408 +++++ modelscope/models/nlp/mglm/fp16/__init__.py | 20 + modelscope/models/nlp/mglm/fp16/fp16.py | 660 +++++++ modelscope/models/nlp/mglm/fp16/fp16util.py | 220 +++ .../models/nlp/mglm/fp16/loss_scaler.py | 245 +++ .../models/nlp/mglm/generation_utils.py | 483 +++++ .../nlp/mglm/mglm_for_text_summarization.py | 469 +++++ modelscope/models/nlp/mglm/model/__init__.py | 20 + .../models/nlp/mglm/model/distributed.py | 127 ++ .../models/nlp/mglm/model/downstream.py | 242 +++ .../models/nlp/mglm/model/modeling_bert.py | 1576 +++++++++++++++++ .../models/nlp/mglm/model/modeling_glm.py | 245 +++ modelscope/models/nlp/mglm/model/prompt.py | 59 + modelscope/models/nlp/mglm/mpu/__init__.py | 37 + .../models/nlp/mglm/mpu/cross_entropy.py | 110 ++ modelscope/models/nlp/mglm/mpu/data.py | 117 ++ modelscope/models/nlp/mglm/mpu/grads.py | 72 + modelscope/models/nlp/mglm/mpu/initialize.py | 130 ++ modelscope/models/nlp/mglm/mpu/layers.py | 357 ++++ modelscope/models/nlp/mglm/mpu/mappings.py | 144 ++ modelscope/models/nlp/mglm/mpu/random.py | 408 +++++ .../models/nlp/mglm/mpu/tests/__init__.py | 0 
.../models/nlp/mglm/mpu/tests/commons.py | 86 + .../nlp/mglm/mpu/tests/test_cross_entropy.py | 106 ++ .../models/nlp/mglm/mpu/tests/test_data.py | 91 + .../nlp/mglm/mpu/tests/test_initialize.py | 95 + .../models/nlp/mglm/mpu/tests/test_layers.py | 533 ++++++ .../models/nlp/mglm/mpu/tests/test_random.py | 206 +++ modelscope/models/nlp/mglm/mpu/transformer.py | 1200 +++++++++++++ modelscope/models/nlp/mglm/mpu/utils.py | 70 + modelscope/models/nlp/mglm/process_grid.py | 61 + modelscope/models/nlp/mglm/requirements.txt | 22 + modelscope/models/nlp/mglm/run_test.py | 10 + .../models/nlp/mglm/tasks/data_utils.py | 389 ++++ .../models/nlp/mglm/tasks/eval_utils.py | 249 +++ .../nlp/mglm/tasks/language_model/dataset.py | 249 +++ .../mglm/tasks/language_model/detokenizer.py | 63 + .../nlp/mglm/tasks/language_model/finetune.py | 254 +++ .../models/nlp/mglm/tasks/seq2seq/dataset.py | 667 +++++++ .../models/nlp/mglm/tasks/seq2seq/evaluate.py | 538 ++++++ .../models/nlp/mglm/tasks/seq2seq/finetune.py | 151 ++ .../models/nlp/mglm/tasks/superglue/README.md | 137 ++ .../nlp/mglm/tasks/superglue/__init__.py | 0 .../nlp/mglm/tasks/superglue/dataset.py | 1475 +++++++++++++++ .../nlp/mglm/tasks/superglue/evaluate.py | 101 ++ .../nlp/mglm/tasks/superglue/finetune.py | 138 ++ .../models/nlp/mglm/tasks/superglue/pvp.py | 1541 ++++++++++++++++ modelscope/models/nlp/mglm/test/__init__.py | 0 modelscope/models/nlp/mglm/test/test_block.py | 36 + .../models/nlp/mglm/test/test_rel_shift.py | 27 + modelscope/models/nlp/mglm/train_utils.py | 472 +++++ modelscope/models/nlp/mglm/utils.py | 529 ++++++ modelscope/outputs/outputs.py | 6 + modelscope/pipelines/nlp/__init__.py | 2 + .../nlp/mglm_text_summarization_pipeline.py | 43 + modelscope/preprocessors/__init__.py | 19 +- modelscope/preprocessors/nlp/__init__.py | 2 + .../nlp/mglm_summarization_preprocessor.py | 32 + requirements/nlp.txt | 15 +- .../pipelines/test_mglm_text_summarization.py | 47 + 76 files changed, 22640 insertions(+), 13 deletions(-) create mode 100644 modelscope/models/nlp/mglm/__init__.py create mode 100755 modelscope/models/nlp/mglm/arguments.py create mode 100644 modelscope/models/nlp/mglm/blocklm_utils.py create mode 100644 modelscope/models/nlp/mglm/configure_data.py create mode 100644 modelscope/models/nlp/mglm/data_utils/__init__.py create mode 100755 modelscope/models/nlp/mglm/data_utils/corpora.py create mode 100644 modelscope/models/nlp/mglm/data_utils/datasets.py create mode 100644 modelscope/models/nlp/mglm/data_utils/extraction.py create mode 100755 modelscope/models/nlp/mglm/data_utils/file_utils.py create mode 100644 modelscope/models/nlp/mglm/data_utils/lazy_loader.py create mode 100644 modelscope/models/nlp/mglm/data_utils/samplers.py create mode 100644 modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py create mode 100755 modelscope/models/nlp/mglm/data_utils/tokenization.py create mode 100644 modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py create mode 100755 modelscope/models/nlp/mglm/data_utils/wordpiece.py create mode 100644 modelscope/models/nlp/mglm/fp16/__init__.py create mode 100755 modelscope/models/nlp/mglm/fp16/fp16.py create mode 100644 modelscope/models/nlp/mglm/fp16/fp16util.py create mode 100755 modelscope/models/nlp/mglm/fp16/loss_scaler.py create mode 100644 modelscope/models/nlp/mglm/generation_utils.py create mode 100644 modelscope/models/nlp/mglm/mglm_for_text_summarization.py create mode 100755 modelscope/models/nlp/mglm/model/__init__.py create mode 100755 
modelscope/models/nlp/mglm/model/distributed.py create mode 100644 modelscope/models/nlp/mglm/model/downstream.py create mode 100644 modelscope/models/nlp/mglm/model/modeling_bert.py create mode 100644 modelscope/models/nlp/mglm/model/modeling_glm.py create mode 100644 modelscope/models/nlp/mglm/model/prompt.py create mode 100755 modelscope/models/nlp/mglm/mpu/__init__.py create mode 100644 modelscope/models/nlp/mglm/mpu/cross_entropy.py create mode 100644 modelscope/models/nlp/mglm/mpu/data.py create mode 100644 modelscope/models/nlp/mglm/mpu/grads.py create mode 100644 modelscope/models/nlp/mglm/mpu/initialize.py create mode 100644 modelscope/models/nlp/mglm/mpu/layers.py create mode 100644 modelscope/models/nlp/mglm/mpu/mappings.py create mode 100755 modelscope/models/nlp/mglm/mpu/random.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/__init__.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/commons.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_data.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_initialize.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_layers.py create mode 100644 modelscope/models/nlp/mglm/mpu/tests/test_random.py create mode 100755 modelscope/models/nlp/mglm/mpu/transformer.py create mode 100644 modelscope/models/nlp/mglm/mpu/utils.py create mode 100644 modelscope/models/nlp/mglm/process_grid.py create mode 100644 modelscope/models/nlp/mglm/requirements.txt create mode 100644 modelscope/models/nlp/mglm/run_test.py create mode 100644 modelscope/models/nlp/mglm/tasks/data_utils.py create mode 100644 modelscope/models/nlp/mglm/tasks/eval_utils.py create mode 100644 modelscope/models/nlp/mglm/tasks/language_model/dataset.py create mode 100755 modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py create mode 100644 modelscope/models/nlp/mglm/tasks/language_model/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py create mode 100644 modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/README.md create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/__init__.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/dataset.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/evaluate.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/finetune.py create mode 100644 modelscope/models/nlp/mglm/tasks/superglue/pvp.py create mode 100644 modelscope/models/nlp/mglm/test/__init__.py create mode 100644 modelscope/models/nlp/mglm/test/test_block.py create mode 100644 modelscope/models/nlp/mglm/test/test_rel_shift.py create mode 100644 modelscope/models/nlp/mglm/train_utils.py create mode 100644 modelscope/models/nlp/mglm/utils.py create mode 100644 modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py create mode 100644 modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py create mode 100644 tests/pipelines/test_mglm_text_summarization.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index a671ded5..3951541c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -82,6 +82,7 @@ class Models(object): bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' T5 = 'T5' + mglm = 'mglm' bloom = 'bloom' # audio models @@ -251,6 +252,7 @@ class Pipelines(object): relation_extraction = 
'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + mglm_text_summarization = 'mglm-text-summarization' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -376,6 +378,7 @@ class Preprocessors(object): re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + mglm_summarization = 'mglm-summarization' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index ccb2d382..1d71469a 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: SbertTokenizerFast, ) from .T5 import T5ForConditionalGeneration + from .mglm import MGLMForTextSummarization from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -106,6 +107,7 @@ else: ], 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], + 'mglm': ['MGLMForTextSummarization'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/mglm/__init__.py b/modelscope/models/nlp/mglm/__init__.py new file mode 100644 index 00000000..26d1101b --- /dev/null +++ b/modelscope/models/nlp/mglm/__init__.py @@ -0,0 +1,22 @@ +# Modified by Zhipu.AI +# Original Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .mglm_for_text_summarization import mGlmForSummarization +else: + _import_structure = { + 'mglm_for_text_summarization': ['MGLMForTextSummarization'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/mglm/arguments.py b/modelscope/models/nlp/mglm/arguments.py new file mode 100755 index 00000000..13b3aeab --- /dev/null +++ b/modelscope/models/nlp/mglm/arguments.py @@ -0,0 +1,793 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""argparser configuration""" + +import argparse +import os + +import deepspeed +import json +import torch + +from .utils import get_hostname + + +def add_model_config_args(parser): + """Model arguments""" + + group = parser.add_argument_group('model', 'model configuration') + + group.add_argument( + '--transformer-xl', + action='store_true', + help='use transformer-xl for training') + group.add_argument( + '--pretrained-bert', + action='store_true', + help='use a pretrained bert-large-uncased model instead' + 'of initializing from scratch. 
See ' + '--tokenizer-model-type to specify which pretrained ' + 'BERT model to use') + group.add_argument( + '--encoder-decoder', + action='store_true', + help='use the encoder-decoder architecture for blocklm') + group.add_argument( + '--attention-dropout', + type=float, + default=0.1, + help='dropout probability for attention weights') + group.add_argument( + '--num-attention-heads', + type=int, + default=16, + help='num of transformer attention heads') + group.add_argument( + '--hidden-size', type=int, default=1024, help='tansformer hidden size') + group.add_argument( + '--intermediate-size', + type=int, + default=None, + help='transformer embedding dimension for FFN' + 'set to 4*`--hidden-size` if it is None') + group.add_argument( + '--num-layers', type=int, default=24, help='num decoder layers') + group.add_argument( + '--layernorm-epsilon', + type=float, + default=1e-5, + help='layer norm epsilon') + group.add_argument( + '--hidden-dropout', + type=float, + default=0.1, + help='dropout probability for hidden state transformer') + group.add_argument( + '--output-dropout', + type=float, + default=0.1, + help='dropout probability for pooled output') + group.add_argument( + '--max-position-embeddings', + type=int, + default=512, + help='maximum number of position embeddings to use') + group.add_argument( + '--vocab-size', + type=int, + default=250112, + help='vocab size to use for non-character-level ' + 'tokenization. This value will only be used when ' + 'creating a tokenizer') + group.add_argument( + '--deep-init', + action='store_true', + help='initialize bert model similar to gpt2 model.' + 'scales initialization of projection layers by a ' + 'factor of 1/sqrt(2N). Necessary to train bert ' + 'models larger than BERT-Large.') + group.add_argument( + '--make-vocab-size-divisible-by', + type=int, + default=128, + help='Pad the vocab size to be divisible by this value.' + 'This is added for computational efficieny reasons.') + group.add_argument( + '--cpu-optimizer', action='store_true', help='Run optimizer on CPU') + group.add_argument( + '--cpu_torch_adam', + action='store_true', + help='Use Torch Adam as optimizer on CPU.') + + return parser + + +def add_fp16_config_args(parser): + """Mixed precision arguments.""" + + group = parser.add_argument_group('fp16', 'fp16 configurations') + + group.add_argument( + '--fp16', action='store_true', help='Run model in fp16 mode') + group.add_argument( + '--fp32-embedding', action='store_true', help='embedding in fp32') + group.add_argument( + '--fp32-layernorm', action='store_true', help='layer norm in fp32') + group.add_argument( + '--fp32-tokentypes', + action='store_true', + help='embedding token types in fp32') + group.add_argument( + '--fp32-allreduce', action='store_true', help='all-reduce in fp32') + group.add_argument( + '--hysteresis', + type=int, + default=2, + help='hysteresis for dynamic loss scaling') + group.add_argument( + '--loss-scale', + type=float, + default=None, + help='Static loss scaling, positive power of 2 ' + 'values can improve fp16 convergence. 
If None, dynamic' + 'loss scaling is used.') + group.add_argument( + '--loss-scale-window', + type=float, + default=1000, + help='Window over which to raise/lower dynamic scale') + group.add_argument( + '--min-scale', + type=float, + default=1, + help='Minimum loss scale for dynamic loss scale') + group.add_argument('--attention-scale', type=float, default=1.0) + return parser + + +def add_training_args(parser): + """Training arguments.""" + + group = parser.add_argument_group('train', 'training configurations') + + group.add_argument( + '--experiment-name', + type=str, + default='gpt-345M', + help='The experiment name for summary and checkpoint') + group.add_argument( + '--batch-size', type=int, default=4, help='Data Loader batch size') + group.add_argument( + '--gradient-accumulation-steps', + type=int, + default=1, + help='Data Loader batch size') + group.add_argument( + '--weight-decay', + type=float, + default=0.01, + help='weight decay coefficient for L2 regularization') + group.add_argument( + '--checkpoint-activations', + action='store_true', + help='checkpoint activation to allow for training ' + 'with larger models and sequences') + group.add_argument( + '--checkpoint-num-layers', + type=int, + default=1, + help='chunk size (number of layers) for checkpointing') + group.add_argument( + '--deepspeed-activation-checkpointing', + action='store_true', + help='uses activation checkpointing from deepspeed') + group.add_argument( + '--epochs', + type=int, + default=None, + help='Number of finetunning epochs. Zero results in evaluation only.') + group.add_argument( + '--clip-grad', type=float, default=1.0, help='gradient clipping') + group.add_argument( + '--train-iters', + type=int, + default=0, + help='total number of iterations to train over all training runs') + group.add_argument('--label-smoothing', type=float, default=0.0) + group.add_argument( + '--log-interval', type=int, default=100, help='report interval') + group.add_argument( + '--summary-dir', + type=str, + default='', + help='The directory to store the summary') + group.add_argument('--seed', type=int, default=1234, help='random seed') + # Batch producer arguments + group.add_argument( + '--reset-position-ids', + action='store_true', + help='Reset posistion ids after end-of-document token.') + group.add_argument( + '--reset-attention-mask', + action='store_true', + help='Reset self attention maske after ' + 'end-of-document token.') + + # Learning rate. + group.add_argument( + '--lr-decay-iters', + type=int, + default=None, + help='number of iterations to decay LR over,' + ' If None defaults to `--train-iters`*`--epochs`') + group.add_argument( + '--lr-decay-style', + type=str, + default='linear', + choices=['constant', 'linear', 'cosine', 'exponential'], + help='learning rate decay function') + group.add_argument('--lr-decay-ratio', type=float, default=0.1) + group.add_argument( + '--lr', type=float, default=1.0e-4, help='initial learning rate') + group.add_argument( + '--warmup', + type=float, + default=0.01, + help='percentage of data to warmup on (.01 = 1% of all ' + 'training iters). 
Default 0.01') + group.add_argument( + '--switch-linear', + action='store_true', + help='Switch to linear decay for cosine decay') + # model checkpointing + group.add_argument( + '--save', + type=str, + default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--new-save-directory', action='store_true') + group.add_argument( + '--save-epoch', + type=int, + default=1, + help='number of epochs between saves') + group.add_argument( + '--save-interval', + type=int, + default=5000, + help='number of iterations between saves') + group.add_argument( + '--no-save-optim', + action='store_true', + help='Do not save current optimizer.') + group.add_argument( + '--no-save-rng', + action='store_true', + help='Do not save current rng state.') + group.add_argument( + '--load', + type=str, + default=None, + help='Path to a directory containing a model checkpoint.') + group.add_argument( + '--no-load-optim', + action='store_true', + help='Do not load optimizer when loading checkpoint.') + group.add_argument( + '--no-load-rng', + action='store_true', + help='Do not load rng state when loading checkpoint.') + group.add_argument( + '--no-load-lr-scheduler', + action='store_true', + help='Do not load lr scheduler when loading checkpoint.') + group.add_argument( + '--no-deepspeed-load', + action='store_true', + help='Not use deepspeed when loading checkpoint') + group.add_argument( + '--finetune', + action='store_true', + help='Load model for finetuning. Do not load optimizer ' + 'or rng state from checkpoint and set iteration to 0. ' + 'Assumed when loading a release checkpoint.') + group.add_argument( + '--resume-dataloader', + action='store_true', + help='Resume the dataloader when resuming training. ' + 'Does not apply to tfrecords dataloader, try resuming' + 'with a different seed in this case.') + # distributed training args + group.add_argument( + '--distributed-backend', + default='nccl', + help= + 'which backend to use for distributed training. 
One of [gloo, nccl]', + choices=['nccl', 'gloo']) + group.add_argument( + '--DDP-impl', + default='torch', + choices=['local', 'torch', 'none'], + help='which DistributedDataParallel implementation to use.') + + group.add_argument( + '--local_rank', + type=int, + default=None, + help='local rank passed from distributed launcher') + # BlockLM training args + group.add_argument( + '--block-lm', + action='store_true', + help='whether use the BlockLM pre-training') + group.add_argument( + '--masked-lm', + action='store_true', + help='whether to use the mlm objective') + group.add_argument('--bert-prob', type=float, default=0.5) + group.add_argument('--gpt-infill-prob', type=float, default=0.5) + group.add_argument('--gpt-min-ratio', type=float, default=0.5) + group.add_argument('--gap-sentence-prob', type=float, default=0.0) + group.add_argument('--gap-sentence-ratio', type=float, default=0.15) + group.add_argument('--avg-block-length', type=int, default=3) + group.add_argument('--short-seq-prob', type=float, default=0.0) + group.add_argument('--single-span-prob', type=float, default=0.0) + group.add_argument( + '--task-mask', + action='store_true', + help='Use different mask for generation and blank filling') + group.add_argument( + '--no-shuffle-block', + action='store_true', + help='not shuffle the blocks when filling the blank') + group.add_argument( + '--no-block-position', + action='store_true', + help='Use (rough) absolute positions instead of block positions') + group.add_argument( + '--sentinel-token', + action='store_true', + help='Use sentinel (mask) tokens to replace 2d position encoding') + group.add_argument('--block-mask-prob', type=float, default=0.0) + group.add_argument('--context-mask-ratio', type=float, default=0.0) + group.add_argument( + '--random-position', + action='store_true', + help='Use random start position to cover all the position embeddings') + return parser + + +def add_evaluation_args(parser): + """Evaluation arguments.""" + + group = parser.add_argument_group('validation', + 'validation configurations') + + group.add_argument( + '--eval-batch-size', + type=int, + default=None, + help='Data Loader batch size for evaluation datasets.' + 'Defaults to `--batch-size`') + group.add_argument( + '--eval-iters', + type=int, + default=100, + help='number of iterations to run for evaluation' + 'validation/test for') + group.add_argument( + '--eval-interval', + type=int, + default=1000, + help='interval between running evaluation on validation set') + group.add_argument( + '--eval-epoch', + type=int, + default=1, + help='epoch between running evaluation on validation set') + group.add_argument( + '--eval-seq-length', + type=int, + default=None, + help='Maximum sequence length to process for ' + 'evaluation. Defaults to `--seq-length`') + group.add_argument( + '--eval-max-preds-per-seq', + type=int, + default=None, + help='Maximum number of predictions to use for ' + 'evaluation. 
Defaults to ' + 'math.ceil(`--eval-seq-length`*.15/10)*10') + group.add_argument('--overlapping-eval', type=int, default=32) + + return parser + + +def add_text_generate_args(parser): + """Text generate arguments.""" + + group = parser.add_argument_group('Text generation', 'configurations') + group.add_argument('--temperature', type=float, default=1.0) + group.add_argument('--top_p', type=float, default=0.0) + group.add_argument('--top_k', type=int, default=0) + group.add_argument('--out-seq-length', type=int, default=256) + group.add_argument('--num-beams', type=int, default=1) + group.add_argument('--length-penalty', type=float, default=0.0) + group.add_argument('--no-repeat-ngram-size', type=int, default=0) + group.add_argument('--min-tgt-length', type=int, default=0) + group.add_argument('--select-topk', action='store_true') + group.add_argument('--blank-maskratio', type=float, default=0.1) + return parser + + +def add_data_args(parser): + """Train/valid/test data arguments.""" + + group = parser.add_argument_group('data', 'data configurations') + + group.add_argument( + '--model-parallel-size', + type=int, + default=1, + help='size of the model parallel.') + group.add_argument( + '--shuffle', + action='store_true', + help='Shuffle data. Shuffling is deterministic ' + 'based on seed and current epoch.') + group.add_argument('--filter-english', action='store_true') + group.add_argument( + '--train-data', + nargs='+', + default=None, + help='Whitespace separated filenames or corpora names ' + 'for training.') + group.add_argument( + '--valid-data', + nargs='*', + default=None, + help="""Filename for validation data.""") + group.add_argument( + '--test-data', + nargs='*', + default=None, + help="""Filename for testing""") + group.add_argument( + '--data-dir', + type=str, + default=None, + help='The data path to all the data files') + group.add_argument( + '--input-data-sizes-file', + type=str, + default='sizes.txt', + help='the filename containing all the shards sizes') + + group.add_argument( + '--delim', default=',', help='delimiter used to parse csv data files') + group.add_argument( + '--text-key', + default='sentence', + help='key to use to extract text from json/csv') + group.add_argument( + '--eval-text-key', + default=None, + help='key to use to extract text from ' + 'json/csv evaluation datasets') + group.add_argument( + '--split', + default='1000,1,1', + help='comma-separated list of proportions for training,' + ' validation, and test split') + + group.add_argument( + '--no-lazy-loader', + action='store_true', + help='whether to lazy read the data set') + group.add_argument('--half-lazy-loader', action='store_true') + group.add_argument( + '--loader-scatter', + type=int, + default=None, + help='Number of scatters to use for dataloaders') + group.add_argument( + '--loose-json', + action='store_true', + help='Use loose json (one json-formatted string per ' + 'newline), instead of tight json (data file is one ' + 'json string)') + group.add_argument( + '--presplit-sentences', + action='store_true', + help='Dataset content consists of documents where ' + 'each document consists of newline separated sentences') + group.add_argument( + '--num-workers', + type=int, + default=2, + help="""Number of workers to use for dataloading""") + group.add_argument( + '--tokenizer-model-type', + type=str, + default=None, + help="Model type to use for sentencepiece tokenization \ + (one of ['bpe', 'char', 'unigram', 'word']) or \ + bert vocab to use for BertWordPieceTokenizer (one of \ + 
['bert-large-uncased', 'bert-large-cased', etc.])") + group.add_argument( + '--tokenizer-path', + type=str, + default='tokenizer.model', + help='path used to save/load sentencepiece tokenization ' + 'models') + group.add_argument( + '--tokenizer-type', + type=str, + default='BertWordPieceTokenizer', + choices=[ + 'CharacterLevelTokenizer', 'SentencePieceTokenizer', + 'BertWordPieceTokenizer', 'GPT2BPETokenizer', 'ChineseSPTokenizer' + ], + help='what type of tokenizer to use') + group.add_argument('--no-pre-tokenize', action='store_true') + group.add_argument( + '--cache-dir', + default=None, + type=str, + help='Where to store pre-trained BERT downloads') + group.add_argument( + '--use-tfrecords', + action='store_true', + help='load `--train-data`, `--valid-data`, ' + '`--test-data` from BERT tf records instead of ' + 'normal data pipeline') + group.add_argument( + '--seq-length', + type=int, + default=512, + help='Maximum sequence length to process') + group.add_argument( + '--mem-length', + type=int, + default=0, + help='The memory length to preserve') + group.add_argument( + '--max-preds-per-seq', + type=int, + default=None, + help='Maximum number of predictions to use per sequence.' + 'Defaults to math.ceil(`--seq-length`*.15/10)*10.' + 'MUST BE SPECIFIED IF `--use-tfrecords` is True.') + group.add_argument('--non-sentence-start', type=float, default=0.0) + group.add_argument( + '--sample-one-document', + action='store_true', + help='only sample one document in one sample') + group.add_argument( + '--load-splits', + type=str, + default=None, + help='The path to load split indices from') + group.add_argument( + '--save-splits', + type=str, + default=None, + help='The path to save split indices to') + group.add_argument( + '--save-test-data', + type=str, + default=None, + help='The path to save the test data') + group.add_argument( + '--multi-task-data', + nargs='*', + default=None, + help='Downsteam task names for multi-task pre-training') + group.add_argument( + '--multi-task-ratio', + type=float, + default=0.0, + help='Ratio for multi-task pre-training') + group.add_argument('--multi-seq-length', type=int, default=None) + group.add_argument('--multi-batch-size', type=int, default=None) + return parser + + +def add_finetune_config_args(parser): + group = parser.add_argument_group('finetune', 'finetune configurations') + group.add_argument('--task', type=str, help='Task name.') + group.add_argument( + '--load-pretrained', + type=str, + help='Load pretrained model', + default=None) + group.add_argument( + '--pool-token', + type=str, + choices=['start', 'pad', 'cls'], + help='The token to pool the sequence representation', + default='cls') + group.add_argument( + '--cloze-eval', + action='store_true', + help='Evaluation dataset with cloze task') + group.add_argument( + '--multi-token', + action='store_true', + help='Use multi token for cloze evaluation') + group.add_argument( + '--segment-length', + type=int, + default=0, + help='The maximum segment length for cloze evaluation') + group.add_argument( + '--loss-func', + type=str, + choices=['cross_entropy', 'hinge', 'generative', 'mix'], + default='cross_entropy') + group.add_argument('--block-lm-ratio', type=float, default=0.0) + group.add_argument( + '--adapet', + action='store_true', + help='Use the decoupled cross entropy loss in AdaPET') + group.add_argument('--pattern-id', type=int, default=0) + group.add_argument( + '--fast-decode', + action='store_true', + help= + 'Fast decode for multi-token cloze. 
Can only be used without checkpoint activation.' + ) + group.add_argument('--few-superglue', action='store_true') + group.add_argument( + '--eval-valid', + action='store_true', + help='Whether evaluate on the valid set') + group.add_argument('--validation-metric', type=str, default=None) + group.add_argument( + '--unidirectional', + action='store_true', + help='Use the left to right language model') + group.add_argument('--src-seq-length', type=int, default=None) + group.add_argument('--tgt-seq-length', type=int, default=None) + group.add_argument('--adam-beta1', type=float, default=0.9) + group.add_argument('--adam-beta2', type=float, default=0.999) + group.add_argument('--adam-eps', type=float, default=1e-8) + group.add_argument( + '--optimizer', type=str, choices=['adam', 'adafactor'], default='adam') + group.add_argument('--wsc-negative', action='store_true') + group.add_argument('--overwrite', action='store_true') + group.add_argument('--no-validation', action='store_true') + # Continuous prompt arguments + group.add_argument( + '--continuous-prompt', + action='store_true', + help='Use continuous prompt for PET') + group.add_argument('--num-prompt-tokens', type=int, default=0) + group.add_argument( + '--prompt-func', default='lstm', choices=['lstm', 'mlp', 'none']) + group.add_argument( + '--freeze-transformer', action='store_true', default=False) + group.add_argument('--tune-prefix-layers', type=int, default=None) + group.add_argument('--prefix-prompt', type=int, default=0) + group.add_argument('--prompt-init', action='store_true', default=False) + return parser + + +def get_args(): + """Parse all the args.""" + + parser = argparse.ArgumentParser(description='PyTorch BERT Model') + parser = add_model_config_args(parser) + parser = add_fp16_config_args(parser) + parser = add_training_args(parser) + parser = add_evaluation_args(parser) + parser = add_text_generate_args(parser) + parser = add_data_args(parser) + parser = add_finetune_config_args(parser) + + # Include DeepSpeed configuration arguments + parser = deepspeed.add_config_arguments(parser) + + args = parser.parse_args(args=[]) + if not args.train_data and not args.data_dir: + print('WARNING: No training data specified') + + args.cuda = torch.cuda.is_available() + + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv('WORLD_SIZE', '1')) + if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi: + mpi_define_env(args) + elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'): + # We are using (OpenMPI) mpirun for launching distributed data parallel processes + local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK')) + local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) + + # Possibly running with Slurm + num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1')) + nodeid = int(os.getenv('SLURM_NODEID', '0')) + + args.local_rank = local_rank + args.rank = nodeid * local_size + local_rank + args.world_size = num_nodes * local_size + + args.model_parallel_size = min(args.model_parallel_size, args.world_size) + if args.rank == 0: + print('using world size: {} and model-parallel size: {} '.format( + args.world_size, args.model_parallel_size)) + + args.dynamic_loss_scale = False + if args.loss_scale is None: + args.dynamic_loss_scale = True + if args.rank == 0: + print(' > using dynamic loss scaling') + + # The args fp32_* or fp16_* meant to be active when the + # args fp16 is set. So the default behaviour should all + # be false. 
+ if not args.fp16: + args.fp32_embedding = False + args.fp32_tokentypes = False + args.fp32_layernorm = False + + if hasattr(args, 'deepspeed' + ) and args.deepspeed and args.deepspeed_config is not None: + with open(args.deepspeed_config) as file: + deepspeed_config = json.load(file) + if 'train_micro_batch_size_per_gpu' in deepspeed_config: + args.batch_size = deepspeed_config[ + 'train_micro_batch_size_per_gpu'] + if 'gradient_accumulation_steps' in deepspeed_config: + args.gradient_accumulation_steps = deepspeed_config[ + 'gradient_accumulation_steps'] + else: + args.gradient_accumulation_steps = 1 + if 'optimizer' in deepspeed_config: + optimizer_params_config = deepspeed_config['optimizer'].get( + 'params', {}) + args.lr = optimizer_params_config.get('lr', args.lr) + args.weight_decay = optimizer_params_config.get( + 'weight_decay', args.weight_decay) + return args + + +def mpi_define_env(args): + from mpi4py import MPI + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + world_size = comm.Get_size() + + master_addr = None + if rank == 0: + master_addr = get_hostname() + master_addr = comm.bcast(master_addr, root=0) + + # Determine local rank by assuming hostnames are unique + proc_name = MPI.Get_processor_name() + all_procs = comm.allgather(proc_name) + local_rank = sum([i == proc_name for i in all_procs[:rank]]) + + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + args.local_rank = local_rank + args.world_size = world_size + args.rank = rank + os.environ['MASTER_ADDR'] = master_addr + os.environ[ + 'MASTER_PORT'] = '29500' # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 + + print( + 'Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}' + .format(os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) diff --git a/modelscope/models/nlp/mglm/blocklm_utils.py b/modelscope/models/nlp/mglm/blocklm_utils.py new file mode 100644 index 00000000..9af83f67 --- /dev/null +++ b/modelscope/models/nlp/mglm/blocklm_utils.py @@ -0,0 +1,625 @@ +# Copyright (c) 2022 Zhipu.AI + +import copy +import math +import random + +import numpy as np +import torch +import torch.utils.data +from scipy.stats import poisson + +from . 
import mpu +from .utils import print_rank_0 + + +def rindex(lst, val, start=None): + if start is None: + start = len(lst) - 1 + for i in range(start, -1, -1): + if lst[i] == val: + return i + return -1 + + +def index_in_list(lst, val, start=None): + if start is None: + start = 0 + for i in range(start, len(lst)): + if lst[i] == val: + return i + return -1 + + +class ConstructBlockStrategy: + + def __init__(self, + args, + tokenizer, + max_seq_length, + bert_prob=1.0, + gap_sentence_prob=0.0, + gpt_infill_prob=0.5, + gpt_min_ratio=0.5, + bert_ratio=0.15, + gap_sentence_ratio=0.15, + average_block_length=3, + max_block_length=40, + block_mask_prob=0.0, + context_mask_ratio=0.0, + context_mask_range=3, + short_seq_prob=0.0, + single_span_prob=0.0, + block_position_encoding=True, + encoder_decoder=False, + shuffle_blocks=True, + sentinel_token=False, + task_mask=False, + random_position=False, + masked_lm=False): + self.eod_token = args.eod_token + self.tokenizer = tokenizer + self.count = 0 + self.max_seq_length = max_seq_length + self.rank = mpu.get_data_parallel_rank() + self.world_size = mpu.get_data_parallel_world_size() + # self.rank = 0 + # self.world_size = 1 + assert 0.0 <= bert_prob <= 1.0 + self.bert_prob = bert_prob + self.gap_sentence_prob = gap_sentence_prob + self.gpt_prob = 1 - bert_prob - gap_sentence_prob + assert self.gpt_prob >= -1e-10 + self.infill_prob = gpt_infill_prob + self.gpt_min_ratio = gpt_min_ratio + self.bert_ratio = bert_ratio + self.gap_sentence_ratio = gap_sentence_ratio + self.block_length_distribution = [ + poisson.pmf(i, average_block_length) + for i in range(1, max_block_length) + ] + self.block_mask_prob = block_mask_prob + self.context_mask_ratio = context_mask_ratio + self.context_mask_range = context_mask_range + self.short_seq_prob = short_seq_prob + self.single_span_prob = single_span_prob + self.block_position_encoding = block_position_encoding + self.encoder_decoder = encoder_decoder + self.shuffle_blocks = shuffle_blocks + self.sentinel_token = sentinel_token + self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = self.tokenizer.get_command( + self.generation_mask).Id + self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = self.tokenizer.get_command( + self.gap_sentence_mask).Id + self.random_position = random_position + self.masked_lm = masked_lm + print_rank_0( + f'BERT prob {self.bert_prob}, gap sent prob {self.gap_sentence_prob}, GPT prob {self.gpt_prob}, infill prob {self.infill_prob}' # noqa + ) + print_rank_0( + f'generation min ratio {self.gpt_min_ratio}, block ratio {self.bert_ratio}, gap sent ratio {self.gap_sentence_ratio}' # noqa + ) + print_rank_0( + f'block length distribution {self.block_length_distribution}') + print_rank_0( + f'block mask prob {self.block_mask_prob}, context mask ratio {self.context_mask_ratio}' + ) + + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + if ';' in tok: + return True + if ':' in tok: + return True + if '。' in tok: + return True + if '?' in tok: + return True + if '!' 
in tok: + return True + if ';' in tok: + return True + if '…' in tok: + return True + if '\n' in tok: + return True + return False + + @staticmethod + def sample_spans(span_lengths, total_length, rng, offset=0): + blank_length = total_length - sum(span_lengths) + m = blank_length - len(span_lengths) + 1 + places = [rng.randrange(m + 1) for _ in range(len(span_lengths))] + places.sort() + spans = [] + for place, span_length in zip(places, span_lengths): + start = offset + place + end = offset + place + span_length + spans.append((start, end)) + offset += span_length + 1 + return spans + + def sample_span_in_document(self, tokens, masked_lengths, rng): + rng.shuffle(masked_lengths) + mask_spans = [] + mask_index = 0 + indices = [-1] + np.where(tokens == self.eod_token)[0].tolist() + last_index = len(tokens) + documents = [] + for index in reversed(indices): + start_index = index + if start_index + 1 < len(tokens) and tokens[ + start_index + 1] == self.tokenizer.get_command('ENC').Id: + start_index += 1 + length = last_index - start_index - 1 + if last_index == len(tokens) and length > 0: + length -= 1 + documents.append((start_index + 1, length)) + last_index = index + documents.sort(key=lambda x: x[1]) + for i, (offset, length) in enumerate(documents): + if i == len(documents) - 1: + current_masked_length, current_count = 0, 0 + while mask_index + current_count < len( + masked_lengths + ) and masked_lengths[ + mask_index + # noqa + current_count] + current_masked_length + current_count <= length: + current_masked_length += masked_lengths[mask_index + + current_count] + current_count += 1 + if current_count > 0: + spans = self.sample_spans( + masked_lengths[mask_index:mask_index + current_count], + length, + rng, + offset=offset) + mask_spans += spans + if mask_index + current_count < len(masked_lengths) - 1: + print(length, masked_lengths[mask_index:], + masked_lengths[:mask_index], indices) + else: + current_masked_total = int(length * self.bert_ratio) + current_masked_length, current_count = 0, 0 + while mask_index + current_count < len( + masked_lengths + ) and masked_lengths[ + mask_index + # noqa + current_count] + current_masked_length <= current_masked_total: + current_masked_length += masked_lengths[mask_index + + current_count] + current_count += 1 + if current_count > 0: + spans = self.sample_spans( + masked_lengths[mask_index:mask_index + current_count], + length, + rng, + offset=offset) + mask_spans += spans + mask_index += current_count + return mask_spans + + def make_masked_data(self, + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task='bert'): + position_ids = np.arange(len(tokens), dtype=np.long) + targets = copy.deepcopy(tokens) + mask_id = self.tokenizer.get_command('MASK').Id + mlm_masks = np.zeros(len(tokens), dtype=np.long) + for start, end in block_spans: + for idx in range(start, end): + tokens[idx] = mask_id + mlm_masks[start:end] = 1 + loss_masks = loss_masks * mlm_masks + return tokens, targets, loss_masks, position_ids + + def make_block_data(self, + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task='bert'): + text_length = len(tokens) + position_ids = np.ones(len(tokens), dtype=np.long) + for start, end in block_spans: + position_ids[start + 1:end] = 0 + position_ids = np.cumsum(position_ids) - 1 + if self.random_position and position_ids[-1] < self.max_seq_length - 1: + position_bias = self.max_seq_length - position_ids[-1] + position_bias = rng.randrange(0, position_bias) + position_ids = position_ids + position_bias + if 
self.encoder_decoder or not self.shuffle_blocks: + block_spans.sort(key=lambda x: x[0]) + else: + rng.shuffle(block_spans) + if self.sentinel_token: + block_spans = [(start, end, idx) + for idx, (start, end) in enumerate(block_spans)] + else: + block_spans = [(start, end, 0) for start, end in block_spans] + target_tokens, target_position_ids, target_block_position_ids, targets = [], [], [], [] + for start, end, idx in block_spans: + sop_token = 'sop' if idx == 0 else f'sop{idx}' + target_tokens.append([self.tokenizer.get_command(sop_token).Id]) + span_tokens = copy.deepcopy(tokens[start:end]) + if self.block_mask_prob > 0.0 and task == 'bert': + for sub_idx in range(len(span_tokens)): + if random.random() < self.block_mask_prob: + span_tokens[sub_idx] = self.tokenizer.get_command( + 'dBLOCK').Id + target_tokens.append(span_tokens) + targets.append(tokens[start:end]) + targets.append([self.tokenizer.get_command('eop').Id]) + if not self.sentinel_token: + target_position_id = position_ids[start:end] + target_position_ids.append(target_position_id) + target_position_ids.append([target_position_id[0]]) + else: + target_position_ids.append([self.max_seq_length] * # noqa + (end - start + 1)) + if self.block_position_encoding: + target_block_position_ids.append( + np.arange(1, end - start + 2, dtype=np.long)) + else: + target_block_position_ids.append([1] * (end - start + 1)) + block_spans.sort(key=lambda x: x[0]) + source_tokens, source_position_ids, local_spans = [], [], [] + last, current_length = 0, 0 + for start, end, idx in block_spans: + if task == 'generation': + mask_id = self.generation_mask + elif task == 'gap_sentence': + mask_id = self.gap_sentence_mask + else: + mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_id = self.tokenizer.get_command(mask_token).Id + local_spans.append((current_length, current_length + start - last)) + source_tokens.append(tokens[last:start]) + source_tokens.append([mask_id]) + source_position_ids.append(position_ids[last:start]) + source_position_ids.append([position_ids[start]]) + current_length += start - last + 1 + last = end + if last < len(tokens): + local_spans.append( + (current_length, current_length + len(tokens) - last)) + source_tokens.append(tokens[last:]) + source_position_ids.append(position_ids[last:]) + source_length = sum(map(len, source_tokens)) + if attention_mask is not None: + assert source_length == attention_mask + if target_tokens and self.eod_token in np.concatenate( + target_tokens).tolist(): + print('Found EOS in target', self.tokenizer.DecodeIds(tokens)) + raise RuntimeError + if self.encoder_decoder: + target_tokens = target_tokens + [ + self.tokenizer.get_command('eop').Id + ] + loss_masks = np.ones(len(target_tokens), dtype=np.long) + return source_tokens, target_tokens, loss_masks + else: + tokens = np.concatenate(source_tokens + target_tokens) + if task == 'bert' and self.context_mask_ratio > 0: + mask_candidates = set() + for start, end in local_spans: + if start != 0: + local_end = min(end, start + self.context_mask_range) + mask_candidates.update(range(start, local_end)) + if end != 0: + local_start = max(start, end - self.context_mask_range) + mask_candidates.update(range(local_start, end)) + mask_pos = rng.sample( + mask_candidates, + int(self.context_mask_ratio * text_length)) + for pos in mask_pos: + tokens[pos] = self.tokenizer.get_command('dBLOCK').Id + targets = np.concatenate(source_tokens + targets) + loss_masks = np.ones(len(tokens), dtype=np.long) + loss_masks[:source_length] = 0 + position_ids = 
np.concatenate(source_position_ids + + target_position_ids) + block_position_ids = np.concatenate( + [np.zeros(source_length, dtype=np.long)] + + target_block_position_ids) + position_ids = np.stack([position_ids, block_position_ids], axis=0) + if attention_mask is not None: + return tokens, targets, loss_masks, position_ids + else: + return tokens, targets, loss_masks, position_ids, source_length + + def generate_blank_data(self, + sample, + masked_lengths, + attention_mask, + rng, + task='bert'): + rng.shuffle(masked_lengths) + tokens, loss_masks = sample['text'], sample['loss_mask'] + assert tokens[0] == self.tokenizer.get_command('ENC').Id + block_spans = self.sample_span_in_document(tokens, masked_lengths, rng) + if len(block_spans) < len(masked_lengths): + return None + if self.masked_lm: + data = self.make_masked_data(tokens, loss_masks, attention_mask, + block_spans, rng) + else: + data = self.make_block_data( + tokens, + loss_masks, + attention_mask, + block_spans, + rng, + task=task) + return data + + def split_samples(self, samples, rng): + target_length = rng.randrange(32, self.max_seq_length - 1) + num_splits = (self.max_seq_length - 1) // target_length + new_samples = [] + cls_id = self.tokenizer.get_command('ENC').Id + eos_id = self.tokenizer.get_command('eos').Id + for sample in samples: + tokens, loss_masks = sample['text'][1:], sample['loss_mask'][1:] + for _ in range(num_splits): + if target_length >= len(tokens): + new_tokens, new_loss_masks = tokens, loss_masks + else: + random_start = rng.randrange(0, + len(tokens) - target_length) + while random_start > 0 and ( + tokens[random_start] == eos_id or # noqa + not (self.contains_sentence_end( # noqa + tokens[random_start - 1]) or # noqa + tokens[random_start - 1] == eos_id)): # noqa + random_start -= 1 + random_end = random_start + target_length + while random_end > random_start and not ( + self.contains_sentence_end(tokens[random_end - 1]) + or tokens[random_end - 1] == eos_id): + random_end -= 1 + if random_end - random_start < target_length // 2: + random_end = random_start + target_length + new_tokens, new_loss_masks = tokens[ + random_start:random_end], loss_masks[ + random_start:random_end] + new_tokens = np.concatenate(([cls_id], new_tokens)) + new_loss_masks = np.concatenate(([0], new_loss_masks)) + new_samples.append({ + 'text': new_tokens, + 'loss_mask': new_loss_masks + }) + return new_samples + + def construct_blocks(self, samples): + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + worker_id, num_workers = worker_info.id, worker_info.num_workers + else: + worker_id, num_workers = 0, 1 + rng = random.Random((self.count * num_workers + worker_id) + * self.world_size + self.rank) + self.count += 1 + token_batch, target_batch, loss_mask_batch, position_id_batch = [], [], [], [] + source_batch, target_batch = [], [] + if rng.random() < self.short_seq_prob: + samples = self.split_samples(samples, rng) + rand = rng.random() + single_span = rand < self.single_span_prob + rand = 0.0 if single_span else rng.random() + attention_mask = [] + if rand < self.bert_prob: + mode = 'bert' + for sample in samples: + if single_span: + masked_lengths = [ + rng.choices( + range(1, + len(self.block_length_distribution) + 1), + weights=self.block_length_distribution)[0] + ] + masked_count = masked_lengths[0] + else: + masked_lengths, masked_count = [], 0 + while masked_count < int( + self.bert_ratio * len(sample['text'])): + block_length = rng.choices( + range(1, + len(self.block_length_distribution) 
+ 1), + weights=self.block_length_distribution)[0] + masked_lengths.append(block_length) + masked_count += block_length + if self.masked_lm: + sep = len(sample['text']) + else: + sep = len( + sample['text']) - masked_count + len(masked_lengths) + data = self.generate_blank_data( + sample, masked_lengths, sep, rng, task='bert') + if data is not None: + if self.encoder_decoder: + source_tokens, target_tokens, loss_masks = data + source_batch.append(source_tokens) + target_batch.append(target_tokens) + loss_mask_batch.append(loss_masks) + else: + tokens, targets, loss_masks, position_ids = data + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + attention_mask.append(sep) + + elif rand < self.bert_prob + self.gap_sentence_prob: + mode = 'sentence' + for sample in samples: + tokens, loss_masks = sample['text'], sample['loss_mask'] + sentence_spans = [] + last_index = 1 if tokens[0] == self.tokenizer.get_command( + 'ENC').Id else 0 + for i in range(len(tokens)): + if self.contains_sentence_end(tokens[i]): + if last_index < i + 1: + sentence_spans.append((last_index, i + 1)) + last_index = i + 1 + elif tokens[i] == self.tokenizer.get_command('eos').Id: + last_index = i + 1 + if last_index < len(tokens): + sentence_spans.append((last_index, len(tokens))) + if not sentence_spans and torch.distributed.get_rank() == 0: + try: + print(self.tokenizer.DecodeIds(tokens[1:])) + except IndexError: + print(tokens[1:]) + rng.shuffle(sentence_spans) + block_spans, block_length = [], 0 + for start, end in sentence_spans: + block_spans.append((start, end)) + block_length += end - start + if block_length >= int( + self.gap_sentence_ratio * len(tokens)): + break + data = self.make_block_data( + tokens, + loss_masks, + None, + block_spans, + rng, + task='gap_sentence') + tokens, targets, loss_masks, position_ids, sep = data + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + attention_mask.append(sep) + else: + # start_indices = [index_in_list(sample['loss_mask'], 1) for sample in samples] + # end_indices = [rindex(sample['loss_mask'], 1) for sample in samples] + # start_index, end_index = max(start_indices), min(end_indices) - self.min_generation_length + # if end_index < start_index + 1: + # end_index = start_index + 1 + # division = rng.randrange(start_index, end_index) + mode = 'gpt' + max_generation_length = rng.randint( + int(self.gpt_min_ratio + * min(map(lambda x: len(x['text']), samples))), + max(map(lambda x: len(x['text']), samples)) - 2) + for sample in samples: + generation_length = min(max_generation_length, + len(sample['text']) - 2) + attention_mask.append( + len(sample['text']) - generation_length + 1) + multiple_doc = index_in_list( + sample['text'], + self.tokenizer.get_command('eos').Id) not in [ + -1, len(sample['text']) - 1 + ] # noqa + if multiple_doc or rng.random() < self.infill_prob: + division = len(sample['text']) - generation_length + tokens, loss_masks = sample['text'], sample['loss_mask'] + source_tokens, target_tokens = tokens[:division], tokens[ + division:] + target_masks = loss_masks[division:] + tokens = np.concatenate((source_tokens, [ + self.generation_mask, + self.tokenizer.get_command('sop').Id + ], target_tokens[:-1])) + targets = np.concatenate( + (source_tokens, [self.generation_mask], target_tokens)) + loss_masks = np.concatenate( + (np.zeros(len(source_tokens) + 1, + dtype=np.long), 
target_masks)) + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_ids = np.arange( + len(source_tokens) + len(target_tokens) + 1, + dtype=np.long) + position_ids[len(source_tokens) + 1:] = len(source_tokens) + if self.block_position_encoding: + block_position_ids = np.concatenate( + (np.zeros(len(source_tokens), dtype=np.long), + np.arange(len(target_tokens) + 1, dtype=np.long))) + else: + block_position_ids = np.concatenate( + (np.zeros(len(source_tokens) + 1, dtype=np.long), + np.ones(len(target_tokens) + 1, dtype=np.long))) + position_id_batch.append( + np.stack([position_ids, block_position_ids], axis=0)) + else: + tokens, targets, loss_masks, position_ids = self.generate_blank_data( + sample, [generation_length], + attention_mask[-1], + rng, + task='generation') + token_batch.append(tokens) + target_batch.append(targets) + loss_mask_batch.append(loss_masks) + position_id_batch.append(position_ids) + if tokens is None: + print(sample, generation_length, multiple_doc) + if self.encoder_decoder: + return { + 'text': torch.tensor(source_batch, dtype=torch.long), + 'target': torch.tensor(target_batch, dtype=torch.long), + 'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long) + } + else: + token_batch, target_batch, loss_mask_batch, position_id_batch = self.pad_batch( + token_batch, target_batch, loss_mask_batch, position_id_batch) + return { + 'text': torch.tensor(token_batch, dtype=torch.long), + 'target': torch.tensor(target_batch, dtype=torch.long), + 'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long), + 'position_id': + torch.tensor(position_id_batch, dtype=torch.long), + 'attention_mask': + torch.tensor(attention_mask, dtype=torch.long), + 'mode': mode + } + + @staticmethod + def pad_batch(token_batch, target_batch, loss_mask_batch, + position_id_batch): + seq_lengths = list(map(len, token_batch)) + if seq_lengths.count(seq_lengths[0]) != len(seq_lengths): + max_length = max(seq_lengths) + token_batch = [ + np.concatenate( + (tokens, np.zeros(max_length - len(tokens), + dtype=np.long))) + for tokens in token_batch + ] + target_batch = [ + np.concatenate( + (targets, + np.zeros(max_length - len(targets), dtype=np.long))) + for targets in target_batch + ] + loss_mask_batch = [ + np.concatenate( + (loss_masks, + np.zeros(max_length - len(loss_masks), dtype=np.long))) + for loss_masks in loss_mask_batch + ] + position_id_batch = [ + np.concatenate((position_ids, + np.zeros( + (2, max_length - position_ids.shape[1]), + dtype=np.long)), + axis=1) for position_ids in position_id_batch + ] + return token_batch, target_batch, loss_mask_batch, position_id_batch diff --git a/modelscope/models/nlp/mglm/configure_data.py b/modelscope/models/nlp/mglm/configure_data.py new file mode 100644 index 00000000..6921de08 --- /dev/null +++ b/modelscope/models/nlp/mglm/configure_data.py @@ -0,0 +1,513 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
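For reference, the padding performed by the pad_batch helper at the end of ConstructBlockStrategy above can be reproduced with a few lines of numpy on toy data. This is only an illustration (np.int64 stands in for the np.long alias used in the patch; the sample values are arbitrary):

import numpy as np

# Two toy samples of unequal length (token ids chosen arbitrarily).
token_batch = [np.array([101, 7, 8, 9]), np.array([101, 5, 6])]
position_id_batch = [np.zeros((2, len(t)), dtype=np.int64) for t in token_batch]

max_length = max(len(t) for t in token_batch)  # 4

# 1-D fields (tokens, targets, loss masks) are right-padded with zeros.
padded_tokens = [
    np.concatenate((t, np.zeros(max_length - len(t), dtype=np.int64)))
    for t in token_batch
]
# The 2 x seq_len position ids are padded along the second axis instead.
padded_position_ids = [
    np.concatenate((p, np.zeros((2, max_length - p.shape[1]), dtype=np.int64)), axis=1)
    for p in position_id_batch
]

print([t.tolist() for t in padded_tokens])   # [[101, 7, 8, 9], [101, 5, 6, 0]]
print(padded_position_ids[1].shape)          # (2, 4)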
+"""parses arguments and preps data loader""" + +import copy +import os +import random +from bisect import bisect_right +from itertools import accumulate + +import numpy as np +import torch +import torch.utils.data + +from . import data_utils, mpu +from .blocklm_utils import ConstructBlockStrategy +from .data_utils.tokenization import make_tokenizer +from .utils import print_rank_0 + + +class MultiTaskDataset(torch.utils.data.Dataset): + + def __init__(self, + tasks, + datasets, + reweight=True, + temperature=0.8, + max_limit=200000): + super(MultiTaskDataset, self).__init__() + self.tasks = tasks + self.datasets = datasets + self.reweight = reweight + self.temperature = temperature + self.lens = [len(dataset) for dataset in datasets] + self.weights = np.array( + [min(length, max_limit)**temperature for length in self.lens]) + self.total_len = sum(self.lens) + self.cumulative_lens = list(accumulate(self.lens)) + if self.reweight: + print_rank_0(list(zip(self.tasks, self.lens, self.weights))) + else: + print_rank_0(list(zip(self.tasks, self.lens))) + self.weights /= self.weights.sum() + + def __len__(self): + return self.total_len * 1000 + + @staticmethod + def pet_wrapper(data): + text = data['text'] + loss_mask = data['logit_mask'] + target = data['target'] + attention_mask = data['mask'] + position_id = data['position'] + label = data['label'] + if len(text.shape) == 2: + text = text[label] + loss_mask = loss_mask[label] + target = target[label] + attention_mask = attention_mask[label] + position_id = position_id[label] + else: + target = target[label] + if not target.shape: + target = target.repeat(len(text)) + return { + 'text': text, + 'target': target, + 'loss_mask': loss_mask, + 'position_id': position_id, + 'attention_mask': attention_mask + } + + def __getitem__(self, idx): + if self.reweight: + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + dataset_idx = rng.choice( + np.arange(len(self.datasets)), p=self.weights) + dataset = self.datasets[dataset_idx] + sample_idx = rng.choice(np.arange(len(dataset))) + item = self.datasets[dataset_idx][sample_idx] + else: + dataset_idx = bisect_right(self.cumulative_lens, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_lens[dataset_idx - 1] + item = self.datasets[dataset_idx][sample_idx] + item = self.pet_wrapper(item) + return item + + +class DataConfig: + + def __init__(self, defaults=None): + super(DataConfig, self).__init__() + if defaults is None: + defaults = {} + self.defaults = defaults + + def apply(self, args, tokenizer): + if torch.distributed.get_rank() == 0: + print('configuring data') + self.apply_defaults(args) + return make_loaders(args, tokenizer) + + def set_defaults(self, **kwargs): + for k, v in kwargs.items(): + self.defaults[k] = v + + def apply_defaults(self, args): + for k, v in self.defaults.items(): + k = k.replace('-', '_') + if not hasattr(args, k): + setattr(args, k, v) + + +def prepare_tokenizer(args): + add_sentinel_token = 0 + if args.sentinel_token: + add_sentinel_token = args.max_position_embeddings + tokenizer = make_tokenizer( + args.tokenizer_type, + None, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + add_block_symbols=args.block_lm, + cache_dir=args.cache_dir, + add_sentinel_token=add_sentinel_token, + add_task_mask=args.task_mask, + add_decoder_mask=args.block_mask_prob > 0.0 + or args.context_mask_ratio > 0.0) + if mpu.get_model_parallel_rank() == 0: + num_tokens = 
tokenizer.num_tokens + eod_token = tokenizer.get_command('eos').Id + assert eod_token == tokenizer.get_command('pad').Id + before = num_tokens + after = before + multiple = args.make_vocab_size_divisible_by + while (after % multiple) != 0: + after += 1 + print_rank_0('> padded vocab (size: {}) with {} dummy ' + 'tokens (new size: {})'.format(before, after - before, + after)) + print_rank_0('> found end-of-document token: {}'.format(eod_token)) + token_counts = torch.cuda.LongTensor([after, eod_token]) + else: + token_counts = torch.cuda.LongTensor([0, 0]) + # Broadcast num tokens. + torch.distributed.broadcast( + token_counts, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + num_tokens = token_counts[0].item() + eod_token = token_counts[1].item() + args.vocab_size, args.eod_token = num_tokens, eod_token + return tokenizer + + +def make_data_loader(dataset, + tokenizer, + batch_size, + num_iters, + args, + shuffle=False, + block_collate=False): + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group()) + if args.loader_scatter is not None: + rank = rank // args.loader_scatter + world_size = world_size // args.loader_scatter + batch_size = batch_size // args.loader_scatter + distributed = world_size > 1 + if args.transformer_xl: + batch_sampler = data_utils.samplers.DistributedSequentialSampler( + len(dataset), num_iters, batch_size, rank, world_size) + else: + if shuffle: + sampler = data_utils.samplers.RandomSampler( + dataset, + replacement=True, + num_samples=batch_size * args.train_iters + * args.gradient_accumulation_steps) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + drop_last = distributed + # the GPUs in the same model parallel group receive the same data + if distributed: + batch_sampler = data_utils.samplers.DistributedBatchSampler( + sampler, + batch_size, + drop_last, + rank, + world_size, + gradient_accumulation_steps=args.gradient_accumulation_steps) + else: + batch_sampler = torch.utils.data.BatchSampler( + sampler, batch_size, drop_last) + collate_fn = None + if block_collate: + collate_fn = ConstructBlockStrategy( + args, + tokenizer, + args.seq_length, + bert_prob=args.bert_prob, + gap_sentence_prob=args.gap_sentence_prob, + gap_sentence_ratio=args.gap_sentence_ratio, + gpt_infill_prob=args.gpt_infill_prob, + average_block_length=args.avg_block_length, + gpt_min_ratio=args.gpt_min_ratio, + block_mask_prob=args.block_mask_prob, + context_mask_ratio=args.context_mask_ratio, + short_seq_prob=args.short_seq_prob, + single_span_prob=args.single_span_prob, + shuffle_blocks=not args.no_shuffle_block, + block_position_encoding=not args.no_block_position, + sentinel_token=args.sentinel_token, + encoder_decoder=args.encoder_decoder, + task_mask=args.task_mask, + random_position=args.random_position, + masked_lm=args.masked_lm).construct_blocks + data_loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + collate_fn=collate_fn) + + return data_loader + + +def make_tfrecord_loaders(args): + """Load train/val/test dataset from shuffled TFRecords""" + + import data_utils.tf_dl + data_set_args = { + 'batch_size': args.batch_size, + 'max_seq_len': args.seq_length, + 'max_preds_per_seq': args.max_preds_per_seq, + 'train': True, + 'num_workers': max(args.num_workers, 1), + 'seed': args.seed + args.rank + 1, + 'threaded_dl': args.num_workers > 0 + } + train = 
data_utils.tf_dl.TFRecordDataLoader(args.train_data, + **data_set_args) + data_set_args['train'] = False + if args.eval_seq_length is not None: + data_set_args['max_seq_len'] = args.eval_seq_length + if args.eval_max_preds_per_seq is not None: + data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + valid = None + if args.valid_data is not None: + valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data, + **data_set_args) + test = None + if args.test_data is not None: + test = data_utils.tf_dl.TFRecordDataLoader(args.test_data, + **data_set_args) + tokenizer = data_utils.make_tokenizer( + args.tokenizer_type, + train, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + cache_dir=args.cache_dir) + + return (train, valid, test), tokenizer + + +def make_loaders(args, tokenizer): + """makes training/val/test""" + + if args.use_tfrecords: + return make_tfrecord_loaders(args) + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + if args.loader_scatter is not None: + assert world_size % args.loader_scatter == 0 + batch_size = args.batch_size * world_size + eval_batch_size = batch_size + if args.eval_batch_size is not None: + eval_batch_size = args.eval_batch_size * world_size + seq_length = args.seq_length + if seq_length < 0: + seq_length = seq_length * world_size + eval_seq_length = args.eval_seq_length + if eval_seq_length is not None and eval_seq_length < 0: + eval_seq_length = eval_seq_length * world_size + split = get_split(args) + data_set_args = { + 'path': args.train_data, + 'seq_length': seq_length, + 'mem_length': args.mem_length, + 'delim': args.delim, + 'text_key': args.text_key, + 'label_key': 'label', + 'ds_type': args.data_set_type, + 'split': split, + 'loose': args.loose_json, + 'max_preds_per_seq': args.max_preds_per_seq, + 'presplit_sentences': args.presplit_sentences, + 'sample_one_document': args.sample_one_document, + 'filter_english': args.filter_english, + 'pre_tokenize': not args.no_pre_tokenize, + 'tokenizer': tokenizer, + 'save_splits': args.save_splits, + 'load_splits': args.load_splits, + 'save_test_data': args.save_test_data, + 'no_lazy_loader': args.no_lazy_loader, + 'loader_scatter': args.loader_scatter, + 'data_parallel_rank': mpu.get_data_parallel_rank(), + 'non_sentence_start': args.non_sentence_start, + 'half_lazy_loader': args.half_lazy_loader + } + + eval_set_args = copy.copy(data_set_args) + eval_set_args['split'] = [1.] 
+ # if optional eval args were set then replace their + # equivalent values in the arg dict + if eval_seq_length: + eval_set_args['seq_length'] = eval_seq_length + if args.eval_max_preds_per_seq: + eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + if args.eval_text_key is not None: + eval_set_args['text_key'] = args.eval_text_key + + # make datasets splits and tokenizer + train, valid, test = None, None, None + + if args.train_data is not None: + train = data_utils.make_dataset(**data_set_args) + if data_utils.should_split(split): + train, valid, test = train + eval_set_args['tokenizer'] = tokenizer + + # make training and val dataset if necessary + if valid is None and args.valid_data is not None: + eval_set_args['path'] = args.valid_data + valid = data_utils.make_dataset(**eval_set_args) + eval_set_args['tokenizer'] = tokenizer + if test is None and args.test_data is not None: + eval_set_args['path'] = args.test_data + test = data_utils.make_dataset(**eval_set_args) + + # wrap datasets with data loader + use_block = args.block_lm or args.encoder_decoder + + if train is not None and args.batch_size > 0: + train = make_data_loader( + train, + tokenizer, + batch_size, + args.train_iters, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_train = True + else: + args.do_train = False + eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size + if valid is not None: + valid = make_data_loader( + valid, + tokenizer, + eval_batch_size, + args.train_iters, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_valid = True + else: + args.do_valid = False + if test is not None: + test = make_data_loader( + test, + tokenizer, + eval_batch_size, + len(test) // eval_batch_size + 1, + args, + shuffle=args.shuffle, + block_collate=use_block) + args.do_test = True + else: + args.do_test = False + + return train, valid, test + + +def build_multi_task_dataset(args, tokenizer): + task_dirs = { + 'mnli': 'MNLI', + 'cola': 'CoLA', + 'mrpc': 'MRPC', + 'qnli': 'QNLI', + 'qqp': 'QQP', + 'sst2': 'SST-2', + 'agnews': 'Agnews', + 'yelp-polarity': 'yelp_review_polarity_csv', + 'yelp-full': 'yelp_review_full_csv', + 'yahoo': 'Yahoo', + 'squad': 'SQuAD', + 'race': 'RACE' + } + train, valid = None, None + if mpu.get_model_parallel_rank() == 0: + multi_seq_length = args.seq_length + if args.multi_seq_length is not None: + multi_seq_length = args.multi_seq_length + train_datasets, valid_datasets = [], [] + for task in args.multi_task_data: + task = task.lower() + data_dir = os.path.join(args.data_dir, task_dirs[task]) + train_datasets.append( + SuperGlueDataset( + args, + task, + data_dir, + multi_seq_length, + 'train', + tokenizer, + pattern_ensemble=True)) + valid_datasets.append( + SuperGlueDataset( + args, + task, + data_dir, + multi_seq_length, + 'dev', + tokenizer, + pattern_ensemble=True)) + train = MultiTaskDataset(args.multi_task_data, train_datasets) + valid = MultiTaskDataset(args.multi_task_data, valid_datasets) + world_size = torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + multi_batch_size = args.batch_size * world_size + if args.multi_batch_size is not None: + multi_batch_size = args.multi_batch_size * world_size + train = make_data_loader( + train, + tokenizer, + multi_batch_size, + args.train_iters, + args, + shuffle=True) + valid = make_data_loader( + valid, + tokenizer, + multi_batch_size, + args.train_iters, + args, + shuffle=True) + return train, valid + + +def get_split(args): + """ + Get dataset splits from 
comma separated string list + """ + splits = [] + if args.split.find(',') != -1: + splits = [float(s) for s in args.split.split(',')] + elif args.split.find('/') != -1: + splits = [float(s) for s in args.split.split('/')] + else: + splits = [float(args.split)] + split_total = sum(splits) + if split_total < 1.: + splits.append(1 - split_total) + while len(splits) < 3: + splits.append(0.) + splits = splits[:3] + if args.valid_data is not None: + splits[1] = 0. + if args.test_data is not None: + splits[2] = 0. + final_sum = sum(splits) + return [s / final_sum for s in splits] + + +def configure_data(): + """add cmdline flags for configuring datasets""" + # These are options that are used by data_utils, but are either + # deprecated or not meant to be exposed to the command line user. + # These options are intneded to be set in code by specific scripts. + defaults = { + 'world_size': 1, + 'rank': -1, + 'persist_state': 0, + 'lazy': False, + 'transpose': False, + 'data_set_type': 'supervised', + 'seq_length': 256, + 'eval_seq_length': 256, + 'samples_per_shard': 100 + } + + return DataConfig(defaults=defaults) diff --git a/modelscope/models/nlp/mglm/data_utils/__init__.py b/modelscope/models/nlp/mglm/data_utils/__init__.py new file mode 100644 index 00000000..fa243cb4 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/__init__.py @@ -0,0 +1,341 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for creating datasets""" +import math +import os +import random +import time + +import torch + +from . import corpora +from .datasets import (BertSentencepairDataset, BlockDataset, ConcatDataset, + GPT2Dataset, ShuffleDataset, SplitDataset, XLDataset, + split_ds) +from .lazy_loader import (LazyLoader, LazyWriter, exists_lazy, exists_scatter, + get_scatter_path) +from .samplers import DistributedBatchSampler +from .tokenization import (BertWordPieceTokenizer, CharacterLevelTokenizer, + CommandToken, GPT2BPETokenizer, Tokenization, + Tokenizer, make_tokenizer) + +TRAIN_DATA = 0 +VAL_DATA = 1 +TEST_DATA = 2 + + +def should_split(split): + """ + given split proportions checks if should split + Examples: + >>> should_split([10,0,0]) + False + >>> should_split([1,.1,.2]) + True + """ + return max(split) / sum(split) != 1. 
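The split handling above (get_split in configure_data.py and the should_split helper just defined) is easiest to see with concrete numbers. The following minimal sketch walks a typical '949,50,1' split string through the same arithmetic; a stand-in Namespace replaces the real argparse namespace, and only the fields read by get_split() are set:

from argparse import Namespace

# Hypothetical parsed arguments (illustration only).
args = Namespace(split='949,50,1', valid_data=None, test_data=None)

splits = [float(s) for s in args.split.split(',')]   # [949.0, 50.0, 1.0]
proportions = [s / sum(splits) for s in splits]      # [0.949, 0.05, 0.001]

# should_split() is True because no single proportion covers the whole
# dataset, so make_dataset() will later divide the corpus with split_ds().
print(proportions, max(proportions) / sum(proportions) != 1.)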
+ + +def get_ext(path): + """gets path extension""" + return os.path.splitext(path)[1] + + +def get_dataset(name, + tokenizer, + pre_tokenize, + data_parallel_rank, + loader_scatter=None, + no_lazy_loader=False, + half_lazy_loader=False): + """gets dataset object based on keyword args and file at `path`""" + global_rank = torch.distributed.get_rank() + if not supported_corpus(name): + raise NotImplementedError('dataset %s is not supported' % name) + dataset = corpora.NAMED_CORPORA[name] + path = dataset.PATH + if issubclass(dataset, corpora.PromptReader): + if not (exists_lazy(path, data_type='prompt') + and exists_lazy(path, data_type='text')) and not ( + loader_scatter is not None and exists_scatter( + path, data_type='prompt', scatter_num=loader_scatter) + and exists_scatter( + path, data_type='text', scatter_num=loader_scatter)): + # create cached version of dataset for lazy loading if it doesn't exist + if global_rank == 0: + print(f'Creating lazy loader for dataset {name}') + prompt_writer = LazyWriter( + path, data_type='prompt', is_array=pre_tokenize) + text_writer = LazyWriter( + path, data_type='text', is_array=pre_tokenize) + writers = {'prompt': prompt_writer, 'text': text_writer} + reader = dataset( + writers=writers, + tokenizer=tokenizer, + tokenize=pre_tokenize) + reader.process() + prompt_writer.close() + text_writer.close() + else: + while not os.path.exists( + LazyWriter.get_len_path(path, data_type='prompt')): + time.sleep(1) + map_fn = (lambda x: x.tolist()) if pre_tokenize else None + if loader_scatter is not None: + if not (exists_scatter( + path, data_type='prompt', scatter_num=loader_scatter) + and exists_scatter( + path, data_type='text', scatter_num=loader_scatter)): + if global_rank == 0: + print(f'Creating scatter loader for dataset {name}') + prompts = LazyLoader( + path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + indices = list(range(len(texts))) + random.shuffle(indices) + segment_length = (len(indices) - 1) // loader_scatter + 1 + for i in range(loader_scatter): + scatter_path = get_scatter_path(path, scatter_rank=i) + prompt_writer = LazyWriter( + scatter_path, + data_type='prompt', + is_array=pre_tokenize) + text_writer = LazyWriter( + scatter_path, + data_type='text', + is_array=pre_tokenize) + for idx in indices[i * segment_length:(i + 1) + * segment_length]: + prompt_writer.write(prompts[idx]) + text_writer.write(texts[idx]) + prompt_writer.close() + text_writer.close() + else: + while not (exists_scatter( + path, data_type='prompt', + scatter_num=loader_scatter) and exists_scatter( + path, + data_type='text', + scatter_num=loader_scatter)): + time.sleep(1) + scatter_path = get_scatter_path( + path, scatter_rank=data_parallel_rank % loader_scatter) + print(f'Rank {global_rank} is using scatter from {scatter_path}') + prompts = LazyLoader( + scatter_path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + texts = LazyLoader( + scatter_path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + else: + prompts = LazyLoader( + path, + data_type='prompt', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + 
mem_map=True, + is_array=pre_tokenize, + load_memory=no_lazy_loader, + half_load=half_lazy_loader) + text = corpora.PromptDataset( + prompt_loader=prompts, + text_loader=texts, + tokenizer=tokenizer, + to_tokenize=not pre_tokenize) + if loader_scatter is None: + if global_rank == 0: + print(f'Create dataset {name} with {len(text)} documents') + for i in range(10): + rand_id = i if i < 5 else random.randrange(len(text)) + sample_tokens = text[rand_id]['tokens'][:1024] + print(sample_tokens) + print(tokenizer.DecodeIds(sample_tokens).encode('utf-8')) + else: + for scatter_id in range(loader_scatter): + if data_parallel_rank % loader_scatter == scatter_id and data_parallel_rank // loader_scatter == 0: + print( + f'Create dataset {name} at scatter {scatter_id} with {len(text)} documents' + ) + for i in range(10): + sample_tokens = text[i]['tokens'][:1024] + print(sample_tokens) + print(tokenizer.DecodeIds(sample_tokens)) + torch.distributed.barrier() + return text + elif issubclass(dataset, corpora.KeyReader): + if not (exists_lazy(path, data_type='text') + and exists_lazy(path, data_type='mask')): + # create cached version of dataset for lazy loading if it doesn't exist + if global_rank == 0: + text_writer = LazyWriter( + path, data_type='text', is_array=pre_tokenize) + mask_writer = LazyWriter(path, data_type='mask', is_array=True) + writers = {'mask': mask_writer, 'text': text_writer} + dataset( + writers=writers, + tokenizer=tokenizer, + tokenize=pre_tokenize) + mask_writer.close() + text_writer.close() + else: + while not os.path.exists( + LazyWriter.get_len_path(path, data_type='mask')): + time.sleep(1) + map_fn = (lambda x: x.tolist()) if pre_tokenize else None + masks = LazyLoader( + path, data_type='mask', map_fn=map_fn, mem_map=True, is_array=True) + texts = LazyLoader( + path, + data_type='text', + map_fn=map_fn, + mem_map=True, + is_array=pre_tokenize) + text = corpora.KeyDataset( + mask_loader=masks, + text_loader=texts, + tokenizer=tokenizer, + to_tokenize=not pre_tokenize) + return text + + +def supported_corpus(corpus_name): + """checks if corpus name is defined in `corpora.py`""" + return corpus_name in corpora.NAMED_CORPORA + + +def make_dataset(path, + seq_length, + mem_length, + shuffle=True, + split=None, + tokenizer=None, + sample_one_document=False, + pre_tokenize=False, + ds_type='', + save_splits=None, + load_splits=None, + save_test_data=None, + no_lazy_loader=False, + loader_scatter=None, + data_parallel_rank=None, + filter_english=False, + non_sentence_start=0.0, + half_lazy_loader=False, + **kwargs): + """function to create datasets+tokenizers for common options""" + if split is None: + split = [1.] 
+ + # get one or multiple datasets and concatenate + if isinstance(path, str): + ds = get_dataset( + path, + tokenizer=tokenizer, + pre_tokenize=pre_tokenize, + no_lazy_loader=no_lazy_loader, + loader_scatter=loader_scatter, + data_parallel_rank=data_parallel_rank, + half_lazy_loader=half_lazy_loader) + else: + ds = [ + get_dataset( + p, + tokenizer=tokenizer, + pre_tokenize=pre_tokenize, + no_lazy_loader=no_lazy_loader, + loader_scatter=loader_scatter, + data_parallel_rank=data_parallel_rank, + half_lazy_loader=half_lazy_loader) for p in path + ] + ds = ConcatDataset(ds) + + # Split dataset into train/val/test (and wrap bert dataset) + def wrap_dataset(dataset): + if ds_type.lower() == 'bert': + presplit_sentences = kwargs[ + 'presplit_sentences'] if 'presplit_sentences' in kwargs else False + dataset = BertSentencepairDataset( + dataset, + max_seq_len=seq_length, + presplit_sentences=presplit_sentences) + elif ds_type.lower() == 'gpt-xl': + assert pre_tokenize + dataset = XLDataset( + dataset, + tokenizer, + max_seq_len=seq_length, + mem_len=mem_length, + sample_across_doc=not sample_one_document) + elif ds_type.lower() == 'gpt2': + dataset = GPT2Dataset( + dataset, + tokenizer, + max_seq_len=seq_length, + sample_across_doc=not sample_one_document) + elif ds_type.lower() == 'block': + dataset = BlockDataset( + dataset, + tokenizer, + max_seq_len=seq_length, + sample_across_doc=not sample_one_document, + filter_english=filter_english, + non_sentence_start=non_sentence_start) + return dataset + + if should_split(split): + ds = split_ds( + ds, + split, + shuffle=shuffle, + save_splits=save_splits, + load_splits=load_splits) + if save_test_data is not None and torch.distributed.get_rank() == 0: + test_ds = ds[-1] + with open(save_test_data, 'w', encoding='utf-8') as output: + for data in test_ds: + text = data['tokens'] + text = tokenizer.DecodeIds(text) + output.write(text) + output.write('\n') + print(f'Write test data to {save_test_data}') + ds = [wrap_dataset(d) if d is not None else None for d in ds] + else: + ds = wrap_dataset(ds) + return ds diff --git a/modelscope/models/nlp/mglm/data_utils/corpora.py b/modelscope/models/nlp/mglm/data_utils/corpora.py new file mode 100755 index 00000000..7c6f58f8 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/corpora.py @@ -0,0 +1,583 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
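A rough usage sketch for the make_dataset function above; the argument values are hypothetical stand-ins for what make_loaders() normally derives from the command-line args and the tokenizer returned by prepare_tokenizer():

# Assumes `tokenizer` was built via make_tokenizer(...) and that the
# 'wikipedia' corpus registered in corpora.NAMED_CORPORA is available
# at its configured PATH on this machine.
train, valid, test = make_dataset(
    path='wikipedia',            # a single corpus name, or a list to concatenate
    seq_length=512,
    mem_length=0,
    split=[0.949, 0.05, 0.001],  # three-way split, so should_split() is True
    ds_type='block',             # wrap the corpus in BlockDataset
    tokenizer=tokenizer,
    pre_tokenize=True,
    data_parallel_rank=0)
# With a three-way split, make_dataset returns one dataset per split;
# with split=[1.] it would return the wrapped dataset directly.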
+"""several datasets with preset arguments""" +import os +import random +from collections import defaultdict +from multiprocessing import Process, Queue +from queue import Empty + +import json +import tqdm +from torch.utils import data + +from modelscope.models.nlp.mglm.utils import print_rank_0 +from .datasets import csv_dataset, json_dataset +from .lazy_loader import LazyLoader + +NUM_PROCESSES = 100 + + +def punctuation_standardization(string: str): + punctuation_dict = { + '\u201c': "\"", + '\u201d': "\"", + '\u2019': "'", + '\u2018': "'", + '\u2013': '-' + } + for key, value in punctuation_dict.items(): + string = string.replace(key, value) + return string + + +class KeyDataset(data.Dataset): + + def __init__(self, text_loader, mask_loader, **kwargs): + self.texts = text_loader + self.masks = mask_loader + self.is_lazy = False + if isinstance(self.texts, LazyLoader) and isinstance( + self.masks, LazyLoader): + self.text_lens = self.texts.lens + self.is_lazy = True + + def get_text_len(self, idx): + return self.text_lens[idx] + + def __getitem__(self, index): + text = self.texts[index] + mask_length = self.masks[index] + mask = [] + for i, length in enumerate(mask_length): + if i % 2 == 0: + mask += [0] * length + else: + mask += [1] * length + assert len(text) == len(mask) + return {'tokens': text, 'loss_masks': mask} + + def __len__(self): + return len(self.texts) + + +class PromptDataset(data.Dataset): + + def __init__(self, + prompt_loader, + text_loader, + tokenizer=None, + to_tokenize=False, + **kwargs): + self.prompts = prompt_loader + self.texts = text_loader + self.tokenizer = tokenizer + self.to_tokenize = to_tokenize + if isinstance(self.prompts, LazyLoader) and isinstance( + self.texts, LazyLoader): + self.prompt_lens = self.prompts.lens + self.text_lens = self.texts.lens + self.is_lazy = True + + def get_text_len(self, idx): + return self.prompt_lens[idx] + self.text_lens[idx] + + def __getitem__(self, index): + prompt = self.prompts[index] + text = self.texts[index] + if self.to_tokenize: + prompt = self.tokenizer.EncodeAsIds(prompt).tokenization + text = self.tokenizer.EncodeAsIds(text).tokenization + return { + 'tokens': prompt + text, + 'loss_masks': [0] * len(prompt) + [1] * len(text) + } + + def __len__(self): + return len(self.prompts) + + +class DataReader: + PATH = None + assert_str = None + reserve_punct = False + split_row = True + TASK_QUEUE_LIMIT = 10000000 + DONE_QUEUE_LIMIT = 10000000 + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + raise NotImplementedError + + def print_info(self, info): + pass + + def __init__(self, writers, tokenizer=None, tokenize=False, **kwargs): + print(self.PATH) + print(self.assert_str) + assert os.path.exists(self.PATH), self.assert_str + print_rank_0(f'Creating dataset from {self.PATH}') + self.tokenizer = tokenizer + self.tokenize = tokenize + self.writers = writers + + def process(self): + if os.path.isdir(self.PATH): + paths = [ + os.path.join(top, name) for top, _, names in os.walk(self.PATH) + for name in names + ] + # paths = [entry.path for entry in os.scandir(self.PATH) if + # not entry.is_dir() and not entry.name.endswith("bz2")] + else: + paths = [self.PATH] + task_queue, done_queue, info_queue = Queue( + maxsize=self.TASK_QUEUE_LIMIT), Queue( + maxsize=self.DONE_QUEUE_LIMIT), Queue() + processes = [] + for i in range(NUM_PROCESSES): + process = Process( + target=self.tokenize_worker, + args=(task_queue, done_queue, info_queue, self.tokenizer, + self.tokenize)) + process.start() + 
processes.append(process) + + def read_input_to_queue(): + for path in paths: + print_rank_0(f'Start reading {path}') + with open(path) as file: + items = json.load(file) + for item in items: + task_queue.put(item) + # if self.split_row: + # for row in file: + # task_queue.put(row) + # else: + # items = json.load(file) + # for item in items["RECORDS"]: + # task_queue.put(item) + print_rank_0('Read input complete') + for i in range(len(processes)): + task_queue.put('STOP') + + process = Process(target=read_input_to_queue) + process.start() + count = len(processes) + progress_bar = tqdm.tqdm() + while True: + data = done_queue.get() + if data == 'COMPLETE': + count -= 1 + if count == 0: + break + else: + self.write_result(data, self.writers) + progress_bar.update() + progress_bar.close() + self.print_info(info_queue) + + @staticmethod + def write_result(data, writers): + raise NotImplementedError + + @staticmethod + def get_token_count(contents): + return sum(map(len, contents)) + + @classmethod + def process_sample(cls, text, tokenizer, tokenize): + if isinstance(text, str) and tokenize: + if not cls.reserve_punct: + text = punctuation_standardization(text) + text = tokenizer.EncodeAsIds(text).tokenization if text else [] + return text + + @staticmethod + def trim_field(content, max_length): + if len(content) > max_length: + content = content[:max_length] + content += '......' + return content + + def process_line(self, data, tokenizer, tokenize): + raise NotImplementedError + + +class PromptReader(DataReader): + is_json = True + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + for row in iter(input.get, 'STOP'): + if row: + if self.is_json: + row = row.rstrip() + row = json.loads(row) + prompts, texts = self.process_line(row, tokenizer, tokenize) + for prompt, text in zip(prompts, texts): + output.put((prompt, text)) + output.put('COMPLETE') + + @staticmethod + def write_result(data, writers): + prompt, text = data + writers['prompt'].write(prompt) + writers['text'].write(text) + + +class KeyReader(DataReader): + PATH = '/root/data/wikipedia/wiki-key.txt' + assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + keys, contents = data['key'], data['content'] + assert len(keys) == len(contents) + for i in range(1, len(keys)): + keys[i] = ' ' + keys[i] + contents = [' ' + content for content in contents] + keys = [tokenizer.EncodeAsIds(key).tokenization for key in keys] + contents = [ + tokenizer.EncodeAsIds(content).tokenization for content in contents + ] + summary = sum(keys, []) + summary_prefix = self.process_sample('Summary: ', tokenizer, tokenize) + summary_mask = [len(summary_prefix), len(summary)] + summary = summary_prefix + summary + text, text_mask = [], [] + for key, content in zip(keys, contents): + content = content + [tokenizer.get_command('eop').Id] + text += key + text += content + text_mask.append(len(key)) + text_mask.append(len(content)) + return (summary, summary_mask), (text, text_mask) + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + for row in iter(input.get, 'STOP'): + data = json.loads(row) + summary, content = self.process_line(data, tokenizer, tokenize) + output.put((summary, content)) + output.put('COMPLETE') + + @staticmethod + def write_result(data, writers): + summary, content = data + writers['text'].write(summary[0]) + writers['mask'].write(summary[1]) + writers['text'].write(content[0]) + writers['mask'].write(content[1]) + + +class 
zhihu(PromptReader): + PATH = '/dataset/fd5061f6/data/tokenize_data/zhihu.lazy' + reserve_punct = True + assert_str = 'make sure to set PATH for zhihu data_utils/corpora.py' + qtitle_prefix = '问题:' + qcontent_prefix = '问题描述:' + user_prefix = '回答用户:' + answer_prefix = ' 回答:' + + # qtitle_prefix = [] + # qcontent_prefix = [] + # user_prefix = [] + # answer_prefix = [] + + def process_line(self, data, tokenizer, tokenize): + prompts, texts = [], [] + ans_length = len(data.get('ans-content', '')) + ans_up = data.get('ans-up-num', '') + ans_up = int(ans_up) if ans_up else 0 + if ans_length > 100 or ans_up > 1000: + qtitle = data['q_title'] + qcontent = data['q-content'] + if qcontent is None: + qcontent = '' + qcontent = self.trim_field(qcontent, max_length=100) + user = data.get('user-signature', '') + prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.user_prefix + user + self.answer_prefix # noqa + text = data['ans-content'] + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + # prompt = data["q_title"] + data["q-content"] + data["user-signature"] + # text = data["ans-content"] + # prompts.append(prompt) + # texts.append(text) + return prompts, texts + + +class zhidao(PromptReader): + PATH = '/root/data/zhidao/zhidao' + reserve_punct = True + assert_str = 'make sure to set PATH for zhidao data_utils/corpora.py' + qtitle_prefix = '问题:' + qcontent_prefix = '问题描述:' + answer_prefix = '回答:' + + def process_line(self, data, tokenizer, tokenize): + if 'title' not in data: + return [], [] + prompts, texts = [], [] + qtitle = data['title'] + qcontent = data.get('content', '') + qcontent = self.trim_field(qcontent, max_length=100) + prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.answer_prefix + prompt = self.process_sample(prompt, tokenizer, tokenize) + if 'best_answer' in data: + text = data['best_answer']['content'] + if len(text) > 10: + text = self.process_sample(text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + for answer in data.get('other_answers', []): + text = answer['content'] + if len(text) > 100: + text = self.process_sample(text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + return prompts, texts + + +class baike(PromptReader): + PATH = '/dataset/fd5061f6/data/tokenize_data/baike.lazy' + reserve_punct = True + assert_str = 'make sure to set PATH for baike data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + prompts, texts = [], [] + text = data.get('title', '') + data.get('abstract', '') + data.get( + 'content', '') + if text: + p, t = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(p) + texts.append(t) + return prompts, texts + + +class wikipedia(PromptReader): + """ + dataset for wikipedia with arguments configured for convenience + + command line usage: `--train-data wikipedia` + """ + # PATH = '/dataset/data/wiki.txt' + PATH = '/root/data/bert_data/wiki.txt' + assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + text = data['text'] + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + + +class TestDataset(PromptReader): + PATH = '/root/data/test.json' + assert_str = 'make sure to set PATH for wikipedia 
data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + prompt, text = data['prompt'], data['text'] + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + + +class OpenWebText(PromptReader): + PATH = '/dataset/fd5061f6/english_data/openwebtext2' + assert_str = 'make sure to set PATH for openwebtext data_utils/corpora.py' + + def __init__(self, *args, **kwargs): + import fasttext + super().__init__(*args, **kwargs) + self.model = fasttext.load_model( + '/dataset/fd5061f6/english_data/lid.176.bin') + print_rank_0('Load language detection model') + + def process_line(self, data, tokenizer, tokenize): + text = data['text'] + if len(text) > 100: + lang = self.model.predict(text.replace('\n', ''))[0][0] + if lang == '__label__en': + prompt, text = self.process_sample( + '', tokenizer, + tokenize), self.process_sample(text, tokenizer, tokenize) + return [prompt], [text] + return [], [] + + +class CCNews(PromptReader): + PATH = '/mnt/cc_news.json' + assert_str = 'make sure to set PATH for cc-news data_utils/corpora.py' + + def process_line(self, data, tokenizer, tokenize): + text = '' + title = data.get('title', None) + description = data.get('description', None) + maintext = data.get('maintext', None) + if title: + text += title.strip() + ' ' + if description and (not maintext + or not maintext.startswith(description)): + text += description.strip() + ' ' + if maintext: + text += maintext + if len(text) > 100: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class BertData(PromptReader): + is_json = False + PATH = '/dataset/fd5061f6/english_data/wikibook' + + def process_line(self, data, tokenizer, tokenize): + if data: + prompt, text = '', data + prompt, text = self.process_sample(prompt, tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class Pile(PromptReader): + is_json = True + PATH = '/mnt/train' + filtered_sources = [ + 'Github', 'StackExchange', 'DM Mathematics', 'Ubuntu IRC', 'EuroParl', + 'YoutubeSubtitles', 'Enron Emails' + ] + downsample_sources = {'PubMed Central': 0.3, 'ArXiv': 0.3, 'FreeLaw': 0.3} + + def print_info(self, info): + total_dict = defaultdict(int) + while True: + try: + source_dict = info.get(block=False) + for source, length in source_dict.items(): + total_dict[source] += length + except Empty: + break + print_rank_0(total_dict) + + def tokenize_worker(self, input, output, info, tokenizer, tokenize): + source_dict = defaultdict(int) + for row in iter(input.get, 'STOP'): + row = row.rstrip() + if row: + if self.is_json: + row = json.loads(row) + prompts, texts, source = self.process_line( + row, tokenizer, tokenize) + length = 0 + for prompt, text in zip(prompts, texts): + length += len(text) + output.put((prompt, text)) + if source: + source_dict[source] += length + output.put('COMPLETE') + info.put(source_dict) + + def process_line(self, data, tokenizer, tokenize): + source = data['meta'].get('pile_set_name', None) + text = data.get('text', None) + if source and text: + if source in self.filtered_sources: + return [], [], None + elif source in self.downsample_sources and random.random( + ) > self.downsample_sources[source]: + return [], [], None + else: + prompt, text = self.process_sample( + '', tokenizer, + tokenize), self.process_sample(text, tokenizer, 
tokenize) + return [prompt], [text], source + else: + return [], [], None + + +class Stories(PromptReader): + is_json = True + PATH = '/dataset/fd5061f6/english_data/stories_31G.jsonl' + + def process_line(self, data, tokenizer, tokenize): + text = data.get('text', None) + if text: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + return [prompt], [text] + else: + return [], [] + + +class BertBaseData(BertData): + PATH = '/root/data/formatted_one_article_per_line' + + +class BertLargeData(BertData): + PATH = '/dataset/c07bd62b/cognitive/zhengxiao/formatted_one_article_per_line_large' + + +class WuDaoCorpus(PromptReader): + # PATH = "/dataset/fd5061f6/chinese_data/WuDao" + PATH = '/wudao' + is_json = False + reserve_punct = True + split_row = False + + def process_line(self, item, tokenizer, tokenize): + prompts, texts = [], [] + text = '' + title = item.get('title', None) + content = item.get('content', None) + if title: + text += title.strip() + ' ' + if content: + text += content + if len(text) > 100: + prompt, text = self.process_sample('', tokenizer, + tokenize), self.process_sample( + text, tokenizer, tokenize) + prompts.append(prompt) + texts.append(text) + return prompts, texts + + +NAMED_CORPORA = { + 'wikipedia': wikipedia, + 'wikipedia-key': KeyReader, + 'openwebtext': OpenWebText, + 'zhihu': zhihu, + 'zhidao': zhidao, + 'baike': baike, + 'test': TestDataset, + 'wikibook': BertData, + 'bert-base': BertBaseData, + 'bert-large': BertLargeData, + 'cc-news': CCNews, + 'pile': Pile, + 'stories': Stories, + 'wudao': WuDaoCorpus +} diff --git a/modelscope/models/nlp/mglm/data_utils/datasets.py b/modelscope/models/nlp/mglm/data_utils/datasets.py new file mode 100644 index 00000000..777b7d43 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/datasets.py @@ -0,0 +1,1244 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
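The NAMED_CORPORA mapping above is the lookup table used by get_dataset() in data_utils/__init__.py; adding another corpus amounts to subclassing PromptReader and registering it, along the lines of this hypothetical reader (the path and field names are placeholders, following the pattern of the wikipedia and Stories readers above):

class MyJsonCorpus(PromptReader):
    # Hypothetical location; like the readers above, the PATH is hard-coded.
    PATH = '/data/my_corpus.json'
    assert_str = 'make sure to set PATH for MyJsonCorpus in data_utils/corpora.py'

    def process_line(self, data, tokenizer, tokenize):
        # Assume each JSON record carries the raw document under 'text'.
        text = data.get('text', '')
        if len(text) < 20:          # skip near-empty records
            return [], []
        prompt = self.process_sample('', tokenizer, tokenize)
        text = self.process_sample(text, tokenizer, tokenize)
        return [prompt], [text]


NAMED_CORPORA['my-corpus'] = MyJsonCorpus  # then select it with --train-data my-corpus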
+"""dataset objects for jsons, csvs, and BERT datasets""" + +import csv +import math +import os +import random +import time +from bisect import bisect_right +from itertools import accumulate +from operator import itemgetter + +import json +import nltk +import numpy as np +import pandas as pd +import torch +import tqdm +from nltk import tokenize +from torch.utils import data + +from modelscope.models.nlp.mglm.utils import print_rank_0 +from .lazy_loader import LazyLoader, exists_lazy + + +class ShuffleDataset(data.Dataset): + + def __init__(self, ds): + self.ds = ds + self.shuffle_ids = list(range(len(self.ds))) + random.shuffle(self.shuffle_ids) + self.is_lazy = hasattr(ds, 'is_lazy') and ds.is_lazy + if self.is_lazy: + self.prompt_lens = [ + self.ds.prompt_lens[idx] for idx in self.shuffle_ids + ] + self.text_lens = [ + self.ds.text_lens[idx] for idx in self.shuffle_ids + ] + + def __getitem__(self, idx): + return self.ds[self.shuffle_ids[idx]] + + def __len__(self): + return len(self.ds) + + +class ConcatDataset(data.Dataset): + """ + Dataset to concatenate multiple datasets. + Purpose: useful to assemble different existing datasets, possibly + large-scale datasets as the concatenation operation is done in an + on-the-fly manner. + Arguments: + datasets (sequence): List of datasets to be concatenated. + """ + + @staticmethod + def cumsum(sequence): + r, s = [], 0 + for e in sequence: + l = len(e) # noqa + r.append(l + s) + s += l + return r + + def __init__(self, datasets, **kwargs): + super(ConcatDataset, self).__init__() + assert len(datasets) > 0, 'datasets should not be an empty iterable' + self.datasets = list(datasets) + self.is_lazy = sum([ + isinstance(ds, LazyLoader) + or (hasattr(ds, 'is_lazy') and ds.is_lazy) for ds in self.datasets + ]) == len(self.datasets) + self.cumulative_sizes = self.cumsum(self.datasets) + self._X = None + self._Y = None + self._lens = None + + def get_text_len(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_text_len(sample_idx) + + def SetTokenizer(self, tokenizer): + for ds in self.datasets: + ds.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.datasets[0].GetTokenizer() + + def __len__(self): + return self.cumulative_sizes[-1] + + def __getitem__(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx][sample_idx] + + @property + def lens(self): + if self._lens is None: + self._lens = [] + if self.is_lazy: + for data in self.datasets: # noqa + self._lens.extend(data.lens) + else: + for data in self.datasets: # noqa + self._lens.extend([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in data + ]) + return self._lens + + @property + def X(self): + if self._X is None: + self._X = [] + for data in self.datasets: # noqa + self._X.extend(data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = [] + for data in self.datasets: # noqa + self._Y.extend(list(data.Y)) + self._Y = np.array(self._Y) + return self._Y + + +class SplitDataset(data.Dataset): + """ + Dataset wrapper to access a subset of another dataset. + Purpose: useful to index into existing datasets, possibly + large-scale datasets as the subindexing operation is done in an + on-the-fly manner. 
+ Arguments: + ds (Dataset or array-like): List of datasets to be subindexed + split_inds (1D array-like): List of indices part of subset + """ + + def __init__(self, ds, split_inds, **kwargs): + self.split_inds = list(split_inds) + self.wrapped_data = ds + self.is_lazy = isinstance(ds, LazyLoader) or (hasattr(ds, 'is_lazy') + and ds.is_lazy) + self._X = None + self._Y = None + + def __len__(self): + return len(self.split_inds) + + def get_text_len(self, idx): + return self.wrapped_data.get_text_len(self.split_inds[idx]) + + def __getitem__(self, index): + return self.wrapped_data[self.split_inds[index]] + + def SetTokenizer(self, tokenizer): + self.wrapped_data.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.wrapped_data.GetTokenizer() + + @property + def X(self): + if self._X is None: + self._X = itemgetter(*self.split_inds)(self.wrapped_data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = np.array( + itemgetter(*self.split_inds)(self.wrapped_data.Y)) + return self._Y + + def __iter__(self): + for idx in self.split_inds: + yield self.wrapped_data[idx] + + +def split_ds(ds, split=None, shuffle=True, save_splits=None, load_splits=None): + """ + Split a dataset into subsets given proportions of how + much to allocate per split. If a split is 0% returns None for that split. + Purpose: Useful for creating train/val/test splits + Arguments: + ds (Dataset or array-like): Data to be split. + split (1D array-like): proportions to split `ds`. `sum(splits) != 0` + shuffle (boolean): Randomly split dataset. Default: True + save_splits: save split indices to file + load_splits: load split indices from file + """ + if split is None: + split = [.8, .2, .0] + split_sum = sum(split) + if split_sum == 0: + raise Exception('Split cannot sum to 0.') + split = np.array(split) + split /= split_sum + ds_len = len(ds) + inds = np.arange(ds_len) + if shuffle: + rng = np.random.RandomState(1234) + rng.shuffle(inds) + if load_splits is not None: + inds = np.load(load_splits) + assert len(inds) == ds_len + print_rank_0(f'Load split indices from {load_splits}') + elif save_splits is not None: + if torch.distributed.get_rank() == 0: + np.save(save_splits, inds) + print(f'Save split indices to {save_splits}') + start_idx = 0 + residual_idx = 0 + rtn_ds = [None] * len(split) + for i, f in enumerate(split): + if f != 0: + proportion = ds_len * split[i] + residual_idx += proportion % 1 + split_ = int(int(proportion) + residual_idx) + split_inds = inds[start_idx:start_idx + max(split_, 1)] + rtn_ds[i] = SplitDataset(ds, split_inds) + start_idx += split_ + residual_idx %= 1 + return rtn_ds + + +class csv_dataset(data.Dataset): + """ + Class for loading datasets from csv files. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): Path to csv file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): Callable that process a string into desired format. + delim (str): delimiter for csv. Default: ',' + binarize_sent (bool): binarize label values to 0 or 1 if they\'re on a different scale. Default: False + drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty + columns with -1 (regardless if rows are dropped based on value) Default: False + text_key (str): key to get text from csv. Default: 'sentence' + label_key (str): key to get label from json dictionary. 
Default: 'label' + Attributes: + X (list): all strings from the csv file + Y (np.ndarray): labels to train with + """ + + def __init__(self, + path, + tokenizer=None, + preprocess_fn=None, + delim=',', + binarize_sent=False, + drop_unlabeled=False, + text_key='sentence', + label_key='label', + **kwargs): + self.is_lazy = False + self.preprocess_fn = preprocess_fn + self.SetTokenizer(tokenizer) + self.path = path + self.delim = delim + self.text_key = text_key + self.label_key = label_key + self.drop_unlabeled = drop_unlabeled + + if '.tsv' in self.path: + self.delim = '\t' + + self.X = [] + self.Y = [] + try: + cols = [text_key] + if isinstance(label_key, list): + cols += label_key + else: + cols += [label_key] + data = pd.read_csv( + self.path, sep=self.delim, usecols=cols, encoding='latin-1') + except: # noqa + data = pd.read_csv( + self.path, + sep=self.delim, + usecols=[text_key], + encoding='latin-1') + + data = data.dropna(axis=0) + + self.X = data[text_key].values.tolist() + try: + self.Y = data[label_key].values + except Exception as e: # noqa + self.Y = np.ones(len(self.X)) * -1 + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __len__(self): + return len(self.X) + + def __getitem__(self, index): + """process+tokenize string and return string,label,and stringlen""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a csv file + """ + if path is None: + path = self.path + '.results' + print('generating csv at ' + path) + with open(path, 'w') as csvfile: + c = csv.writer(csvfile, delimiter=self.delim) + if writer_gen is not None: + # if first item of generator is a header of what the metrics mean then write header to csv file + if not skip_header: + header = (self.label_key, ) + tuple( + next(writer_gen)) + (self.text_key, ) + c.writerow(header) + for i, row in enumerate(writer_gen): + row = (self.Y[i], ) + tuple(row) + (self.X[i], ) + c.writerow(row) + else: + c.writerow([self.label_key, self.text_key]) + for row in zip(self.Y, self.X): + c.writerow(row) + + +class json_dataset(data.Dataset): + """ + Class for loading datasets from a json dump. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): path to json file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): callable function that process a string into desired format. + Takes string, maxlen=None, encode=None as arguments. Default: process_str + text_key (str): key to get text from json dictionary. 
Default: 'sentence' + label_key (str): key to get label from json dictionary. Default: 'label' + Attributes: + all_strs (list): list of all strings from the dataset + all_labels (list): list of all labels from the dataset (if they have it) + """ + + def __init__(self, + path, + tokenizer=None, + preprocess_fn=None, + binarize_sent=False, + text_key='sentence', + label_key='label', + loose_json=False, + **kwargs): + self.is_lazy = False + self.preprocess_fn = preprocess_fn + self.path = path + self.SetTokenizer(tokenizer) + self.X = [] + self.Y = [] + self.text_key = text_key + self.label_key = label_key + self.loose_json = loose_json + + for j in self.load_json_stream(self.path): + s = j[text_key] + self.X.append(s) + self.Y.append(j[label_key]) + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __getitem__(self, index): + """gets the index'th string from the dataset""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def __len__(self): + return len(self.X) + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a json file + """ + if path is None: + path = self.path + '.results' + + if writer_gen is not None: + # if first item of generator is a header of what the metrics mean then write header to csv file + def gen_helper(): + keys = {} + keys[0] = self.label_key + if not skip_header: + for idx, k in enumerate(tuple(next(writer_gen))): + keys[idx + 1] = k + for i, row in enumerate(writer_gen): + if i == 0 and skip_header: + for idx, _ in enumerate(row): + keys[idx + 1] = 'metric_%d' % (idx, ) + j = {} + for idx, v in enumerate((self.Y[i], ) + tuple(row)): + k = keys[idx] + j[k] = v + yield j + else: + + def gen_helper(): + for y in self.Y: + j = {} + j[self.label_key] = y + yield j + + def out_stream(): + for i, j in enumerate(gen_helper()): + j[self.text_key] = self.X[i] + yield j + + self.save_json_stream(path, out_stream()) + + def save_json_stream(self, save_path, json_stream): + if self.loose_json: + with open(save_path, 'w') as f: + for i, j in enumerate(json_stream): + write_string = '' + if i != 0: + write_string = '\n' + write_string += json.dumps(j) + f.write(write_string) + else: + jsons = [j for j in json_stream] + json.dump(jsons, open(save_path, 'w'), separators=(',', ':')) + + def load_json_stream(self, load_path): + if not self.loose_json: + jsons = json.load(open(load_path, 'r')) + generator = iter(jsons) + else: + + def gen_helper(): + with open(load_path, 'r') as f: + for row in f: + yield json.loads(row) + + generator = gen_helper() + + for j in generator: + if self.label_key not in j: + j[self.label_key] = -1 + yield j + + +class XLDataset(data.Dataset): + + 
def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + mem_len=None, + sample_across_doc=True, + **kwargs): + self.ds = ds + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + if mem_len is None: + mem_len = max_seq_len + self.mem_len = mem_len + self.sample_across_doc = sample_across_doc + self.indices, self.num_samples = None, None + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_indices() + + def init_indices(self): + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['prompt']) + + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.indices = list(accumulate(lens)) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.indices[-1]}' + ) + self.num_samples = self.indices[-1] // self.max_seq_len + 1 + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + tokens, targets, loss_mask, attention_mask = self.getidx(idx) + tokens = self.pad_seq(tokens) + targets = self.pad_seq(targets) + loss_mask = self.pad_seq(loss_mask, pad_id=0) + return { + 'text': np.array(tokens), + 'target': np.array(targets), + 'loss_mask': np.array(loss_mask), + 'attention_mask': np.array(attention_mask) + } + + def getidx(self, idx): + tokens, targets, loss_masks = [], [], [] + attention_mask = np.concatenate( + (np.zeros((self.max_seq_len, self.mem_len), dtype=np.long), + np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)), + axis=1) + sample_idx = bisect_right(self.indices, idx * self.max_seq_len) + last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1] + token_offset = idx * self.max_seq_len - last_end + if token_offset != 0: + history = min(self.mem_len, token_offset) + attention_mask[:, + -self.max_seq_len - history:-self.max_seq_len] = 1 + count = 0 + while len(tokens) < self.max_seq_len and sample_idx < len(self.ds): + item = self.ds[sample_idx] + text, masks = item['tokens'], item['loss_masks'] + text = text + [self.tokenizer.get_command('eos').Id] + end = min( + len(text) - 1, token_offset + self.max_seq_len - len(tokens)) + masks = masks + [1] + if count > 0: + current = len(tokens) + attention_mask[current:, :current + self.mem_len] = 0 + tokens += text[token_offset:end] + targets += text[token_offset + 1:end + 1] + loss_masks += masks[token_offset + 1:end + 1] + count += 1 + sample_idx += 1 + token_offset = 0 + return tokens, targets, loss_masks, attention_mask + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + +class BlockDataset(data.Dataset): + + def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + sample_across_doc=True, + non_sentence_start=0.0, + filter_english=False, + **kwargs): + """ + sentence_start: the stripped article must start with a complete sentence + """ + self.ds = ds + self.ds_len = len(self.ds) + self.num_samples = 1000 * self.ds_len + self.max_seq_len = max_seq_len + self.tokenizer = tokenizer + self.sample_across_doc = sample_across_doc + self.non_sentence_start = non_sentence_start + self.filter_english = filter_english + self.weighting, self.total_len = None, None + self.is_lazy = False + if self.filter_english: + import fasttext + self.model = fasttext.load_model('/mnt/lid.176.bin') + print_rank_0('Load language detection model') + if 
hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_weighting() + + def init_weighting(self): + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.total_len}, non sentence start{self.non_sentence_start}' # noqa + ) + self.weighting = list(accumulate(lens)) + + def get_weighted_samples(self, np_rng): + while True: + idx = np_rng.randint(self.total_len) + data_idx = bisect_right(self.weighting, idx) + tokens, loss_mask = self.getidx(data_idx) + if self.filter_english: + text = self.tokenizer.DecodeIds(tokens[:1024]) + lang = self.model.predict(text.replace('\n', ''))[0][0] + if lang == '__label__en': + break + else: + break + return tokens, loss_mask + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # init rng + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + + # get possibly weighted random index from dataset + tokens, loss_mask = self.get_weighted_samples(rng) + # truncate or pad tokens + num_tokens = len(tokens) + tokens_to_strip = num_tokens - self.max_seq_len + 1 + + # randomly choose a position for start + if tokens_to_strip > 0: + move_count = 0 + strip_left_tokens = rng.randint(tokens_to_strip) + if rng.random() > self.non_sentence_start: + if rng.random() < 0.5: + while move_count < self.max_seq_len // 2 and strip_left_tokens > 0 and not self.contains_sentence_end( # noqa + tokens[strip_left_tokens - 1]): # noqa + strip_left_tokens -= 1 + move_count += 1 + else: + while move_count < self.max_seq_len // 2 and strip_left_tokens < len( + tokens) and not self.contains_sentence_end( + tokens[strip_left_tokens - 1]): + strip_left_tokens += 1 + move_count += 1 + tokens = [self.tokenizer.get_command('ENC').Id + ] + tokens[strip_left_tokens:] + loss_mask = [0] + loss_mask[strip_left_tokens:] + if len(tokens) == 2 and tokens[1] == self.tokenizer.get_command( + 'eos').Id: + tokens, loss_mask = [], [] + tokens, loss_mask = self.right_strip_seq(tokens, loss_mask, + self.max_seq_len) + else: + tokens = [self.tokenizer.get_command('ENC').Id] + tokens + loss_mask = [0] + loss_mask + # Sample multiple documents + if self.sample_across_doc: + while len(tokens) < self.max_seq_len: + new_tokens, new_loss_mask = self.get_weighted_samples(rng) + new_tokens = [self.tokenizer.get_command('ENC').Id + ] + new_tokens + new_loss_mask = [0] + new_loss_mask + is_last = len(new_tokens) >= self.max_seq_len - len(tokens) + new_tokens, new_loss_mask = self.right_strip_seq( + new_tokens, new_loss_mask, + self.max_seq_len - len(tokens)) + tokens += new_tokens + loss_mask += new_loss_mask + if is_last: + break + return {'text': np.array(tokens), 'loss_mask': np.array(loss_mask)} + + def right_strip_seq(self, tokens, loss_mask, seq_length): + strip_right_tokens = len(tokens) - seq_length + if strip_right_tokens > 0: + while strip_right_tokens < len( + tokens) - 1 and not self.contains_sentence_end( + tokens[-strip_right_tokens - 1]): + strip_right_tokens += 1 + if len(tokens) - strip_right_tokens < seq_length // 2: + strip_right_tokens = len(tokens) - seq_length + tokens = tokens[:-strip_right_tokens] + loss_mask = loss_mask[:-strip_right_tokens] + return tokens, loss_mask + + def getidx(self, data_idx): + data = self.ds[data_idx] + tokens, 
loss_masks = data['tokens'], data['loss_masks'] + tokens = tokens + [self.tokenizer.get_command('eos').Id] + loss_masks = loss_masks + [1] + return tokens, loss_masks + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + # TODO: rewrite this function for chinese + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + if ';' in tok: + return True + if ':' in tok: + return True + if '\n' in tok: + return True + return False + + +class GPT2Dataset(data.Dataset): + + def __init__(self, + ds, + tokenizer, + max_seq_len=1024, + num_samples=None, + weighted=True, + sample_across_doc=True, + random_across_doc_sampling=True, + sentence_start=False, + **kwargs): + """ + sentence_start: the stripped article must start with a complete sentence + """ + self.ds = ds + self.ds_len = len(self.ds) + self.num_samples = num_samples + if num_samples is None: + self.num_samples = 1000 * self.ds_len + self.max_seq_len = max_seq_len + self.tokenizer = tokenizer + self.weighted = weighted + self.sample_across_doc = sample_across_doc + self.random_across_doc_sampling = random_across_doc_sampling + self.sentence_start = sentence_start + self.weighting, self.total_len = None, None + self.is_lazy = False + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + self.is_lazy = True + self.init_weighting() + + def init_weighting(self): + if self.weighted: + if self.is_lazy: + lens = np.array( + [self.ds.get_text_len(idx) for idx in range(len(self.ds))]) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + print_rank_0( + f'Dataset document count {len(lens)}, token count {self.total_len}' + ) + self.weighting = list(accumulate(lens)) + else: + self.weighting = None + + def get_weighted_samples(self, np_rng): + if self.weighting is not None: + idx = np_rng.randint(self.total_len) + return bisect_right(self.weighting, idx) + else: + return np_rng.randint(self.ds_len) + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # init rng + rng = random.Random(idx) + rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + + # get possibly weighted random index from dataset + data_idx = self.get_weighted_samples(rng) + # data_idx = rng.choice(self.ds_len, p=self.weighting) + tokens, loss_mask = self.getidx(data_idx) + + # truncate or pad tokens + num_tokens = len(tokens) + tokens_to_strip = num_tokens - self.max_seq_len - 1 + + # randomly choose a position for start + if tokens_to_strip > 0: + strip_left_tokens = rng.randint(tokens_to_strip + 1) + tokens = tokens[strip_left_tokens:] + loss_mask = loss_mask[strip_left_tokens:] + # if self.sentence_start: + # token_copy = list(tokens) + # not_done = True + # while (len(token_copy) > 0) and not_done: + # tok = token_copy.pop(0) + # if self.contains_sentence_end(tok): + # tokens = token_copy + # not_done = False + strip_right_rokens = len(tokens) - self.max_seq_len - 1 + if strip_right_rokens > 0: + tokens = tokens[:-strip_right_rokens] + loss_mask = loss_mask[:-strip_right_rokens] + # Sample multiple documents + if self.sample_across_doc: + while (len(tokens) < (self.max_seq_len + 1)): + if self.random_across_doc_sampling: + data_idx = 
self.get_weighted_samples(rng) + else: + data_idx = (data_idx + 1) % self.ds_len + new_tokens, new_loss_mask = self.getidx(data_idx) + tokens += new_tokens + loss_mask += new_loss_mask + tokens = tokens[:(self.max_seq_len + 1)] + loss_mask = loss_mask[:(self.max_seq_len + 1)] + + tokens = self.pad_seq(tokens) + loss_mask = self.pad_seq(loss_mask, pad_id=0) + return {'text': np.array(tokens), 'loss_mask': np.array(loss_mask)} + + def getidx(self, data_idx): + data = self.ds[data_idx] + tokens, loss_masks = data['tokens'], data['loss_masks'] + tokens = tokens + [self.tokenizer.get_command('eos').Id] + loss_masks = loss_masks + [1] + return tokens, loss_masks + + def pad_seq(self, seq, pad_id=None): + total_tokens = self.max_seq_len + 1 + num_pad_tokens = max(0, total_tokens - len(seq)) + seq += [ + self.tokenizer.get_command('pad').Id if pad_id is None else pad_id + ] * ( + num_pad_tokens) + return seq + + # TODO: rewrite this function for chinese + def contains_sentence_end(self, tok): + tok = self.tokenizer.IdToToken(tok) + if '.' in tok: + return True + if '?' in tok: + return True + if '!' in tok: + return True + return False + + +class BertSentencepairDataset(data.Dataset): + """ + Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair. + Arguments: + ds (Dataset or array-like): data corpus to use for training + max_seq_len (int): maximum sequence length to use for a sentence pair + mask_lm_prob (float): proportion of tokens to mask for masked LM + max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10 + short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len + dataset_size (int): number of random sentencepairs in the dataset. 
Default: len(ds)*(len(ds)-1) + + """ # noqa + + def __init__(self, + ds, + max_seq_len=512, + mask_lm_prob=.15, + max_preds_per_seq=None, + short_seq_prob=.01, + dataset_size=None, + presplit_sentences=False, + weighted=True, + **kwargs): + self.ds = ds + self.ds_len = len(self.ds) + self.tokenizer = self.ds.GetTokenizer() + self.vocab_words = list(self.tokenizer.text_token_vocab.values()) + self.ds.SetTokenizer(None) + self.max_seq_len = max_seq_len + self.mask_lm_prob = mask_lm_prob + if max_preds_per_seq is None: + max_preds_per_seq = math.ceil(max_seq_len * mask_lm_prob / 10) * 10 + self.max_preds_per_seq = max_preds_per_seq + self.short_seq_prob = short_seq_prob + self.dataset_size = dataset_size + if self.dataset_size is None: + self.dataset_size = self.ds_len * (self.ds_len - 1) + self.presplit_sentences = presplit_sentences + if not self.presplit_sentences: + nltk.download('punkt', download_dir='./nltk') + self.weighted = weighted + self.get_weighting() + + def get_weighting(self): + if self.weighted: + if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy: + lens = np.array(self.ds.lens) + else: + lens = np.array([ + len(d['text']) if isinstance(d, dict) else len(d) + for d in self.ds + ]) + self.total_len = np.sum(lens) + self.weighting = list(accumulate(lens)) + else: + self.weighting = None + + def get_weighted_samples(self, np_rng): + if self.weighting is not None: + idx = np_rng.randint(self.total_len) + return bisect_right(self.weighting, idx) + else: + return np_rng.randint(self.ds_len) + + def __len__(self): + return self.dataset_size + + def __getitem__(self, idx): + # get rng state corresponding to index (allows deterministic random pair) + rng = random.Random(idx) + np_rng = np.random.RandomState( + seed=[rng.randint(0, 2**32 - 1) for _ in range(16)]) + # get seq length + target_seq_length = self.max_seq_len + short_seq = False # noqa + if rng.random() < self.short_seq_prob: + target_seq_length = rng.randint(2, target_seq_length) + short_seq = True # noqa + + # get sentence pair and label + is_random_next = None + lena = 0 + lenb = 0 + while (is_random_next is None) or (lena < 1) or (lenb < 1): + tokensa, tokensb, is_random_next = self.create_random_sentencepair( + target_seq_length, rng, np_rng) + lena = len(tokensa[0]) + lenb = len(tokensb[0]) + + # truncate sentence pair to max_seq_len + tokensa, tokensb = self.truncate_seq_pair(tokensa, tokensb, + self.max_seq_len, rng) + # join sentence pair, mask, and pad + tokens, mask, mask_labels, pad_mask = self.create_masked_lm_predictions( + tokensa, tokensb, self.mask_lm_prob, self.max_preds_per_seq, + self.vocab_words, rng) + sample = { + 'text': np.array(tokens[0]), + 'types': np.array(tokens[1]), + 'is_random': int(is_random_next), + 'mask': np.array(mask), + 'mask_labels': np.array(mask_labels), + 'pad_mask': np.array(pad_mask) + } + return sample + + def sentence_split(self, document): + """split document into sentences""" + lines = document.split('\n') + if self.presplit_sentences: + return [line for line in lines if line] + rtn = [] + for line in lines: + if line != '': + rtn.extend(tokenize.sent_tokenize(line)) + return rtn + + def sentence_tokenize(self, + sent, + sentence_num=0, + beginning=False, + ending=False): + """tokenize sentence and get token types""" + tokens = self.tokenizer.EncodeAsIds(sent).tokenization + str_type = 'str' + str(sentence_num) + token_types = [self.tokenizer.get_type(str_type).Id] * len(tokens) + return tokens, token_types + + def get_doc(self, idx): + """gets text of document corresponding 
to idx""" + rtn = self.ds[idx] + if isinstance(rtn, dict): + rtn = rtn['text'] + return rtn + + def create_random_sentencepair(self, target_seq_length, rng, np_rng): + """ + fetches a random sentencepair corresponding to rng state similar to + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294 + """ + is_random_next = None + + curr_strs = [] + curr_str_types = [] + curr_len = 0 + + while curr_len < 1: + curr_len = 0 + doc_a = None + while doc_a is None: + if self.weighted: + # doc_a_idx = np_rng.choice(self.ds_len, p=self.weighting) + doc_a_idx = self.get_weighted_samples(np_rng) + else: + doc_a_idx = rng.randint(0, self.ds_len - 1) + doc_a = self.sentence_split(self.get_doc(doc_a_idx)) + if not doc_a: + doc_a = None + + random_start_a = rng.randint(0, len(doc_a) - 1) + while random_start_a < len(doc_a): + sentence = doc_a[random_start_a] + sentence, sentence_types = self.sentence_tokenize( + sentence, 0, random_start_a == 0, + random_start_a == len(doc_a)) + curr_strs.append(sentence) + curr_str_types.append(sentence_types) + curr_len += len(sentence) + if random_start_a == len( + doc_a) - 1 or curr_len >= target_seq_length: + break + random_start_a = (random_start_a + 1) + + if curr_strs: + num_a = 1 + if len(curr_strs) >= 2: + num_a = rng.randint(0, len(curr_strs)) + + tokens_a = [] + token_types_a = [] + for j in range(num_a): + tokens_a.extend(curr_strs[j]) + token_types_a.extend(curr_str_types[j]) + + tokens_b = [] + token_types_b = [] + is_random_next = False + if len(curr_strs) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + b_len = 0 + while b_len < 1: + doc_b = None + while doc_b is None: + doc_b_idx = rng.randint(0, self.ds_len - 2) + doc_b_idx += int(doc_b_idx >= doc_a_idx) + + doc_b = self.sentence_split(self.get_doc(doc_b_idx)) + if not doc_b: + doc_b = None + + random_start_b = rng.randint(0, len(doc_b) - 1) + while random_start_b < len(doc_b): + sentence_b = doc_b[random_start_b] + new_b_tokens, new_b_types = self.sentence_tokenize( + sentence_b, 1, random_start_b == 0, + random_start_b == len(doc_b)) + b_len += len(new_b_tokens) + tokens_b.extend(new_b_tokens) + token_types_b.extend(new_b_types) + if len(tokens_b) >= target_b_length: + break + random_start_b = (random_start_b + 1) + else: + is_random_next = False + for j in range(num_a, len(curr_strs)): + tokens_b.extend(curr_strs[j]) + token_types_b.extend(curr_str_types[j]) + + return (tokens_a, token_types_a), (tokens_b, + token_types_b), is_random_next + + def truncate_seq_pair(self, a, b, max_seq_len, rng): + """ + Truncate sequence pair according to original BERT implementation: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + max_num_tokens = max_seq_len - 3 + while True: + len_a = len(tokens_a) + len_b = len(tokens_b) + total_length = len_a + len_b + if total_length <= max_num_tokens: + break + if len(tokens_a) > len(tokens_b): + trunc_tokens = tokens_a + trunc_types = token_types_a + else: + trunc_tokens = tokens_b + trunc_types = token_types_b + + assert len(trunc_tokens) >= 1 + + if rng.random() < 0.5: + trunc_tokens.pop(0) + trunc_types.pop(0) + else: + trunc_tokens.pop() + trunc_types.pop() + return (tokens_a, token_types_a), (tokens_b, token_types_b) + + def mask_token(self, idx, tokens, types, vocab_words, rng): + """ + helper function to mask `idx` token from `tokens` according to + section 3.3.1 of 
https://arxiv.org/pdf/1810.04805.pdf + """ + label = tokens[idx] + if rng.random() < 0.8: + new_label = self.tokenizer.get_command('MASK').Id + else: + if rng.random() < 0.5: + new_label = label + else: + new_label = rng.choice(vocab_words) + + tokens[idx] = new_label + + return label + + def pad_seq(self, seq): + """helper function to pad sequence pair""" + num_pad = max(0, self.max_seq_len - len(seq)) + pad_mask = [0] * len(seq) + [1] * num_pad + seq += [self.tokenizer.get_command('pad').Id] * num_pad + return seq, pad_mask + + def create_masked_lm_predictions(self, a, b, mask_lm_prob, + max_preds_per_seq, vocab_words, rng): + """ + Mask sequence pair for BERT training according to: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [ + self.tokenizer.get_command('sep').Id + ] + tokens_b + [self.tokenizer.get_command('sep').Id] + token_types = [token_types_a[0]] + token_types_a + [ + token_types_a[0] + ] + token_types_b + [token_types_b[0]] + + len_a = len(tokens_a) + len_b = len(tokens_b) + + cand_indices = [idx + 1 for idx in range(len_a) + ] + [idx + 2 + len_a for idx in range(len_b)] + + rng.shuffle(cand_indices) + + output_tokens, pad_mask = self.pad_seq(list(tokens)) + output_types, _ = self.pad_seq(list(token_types)) + + num_to_predict = min(max_preds_per_seq, + max(1, int(round(len(tokens) * mask_lm_prob)))) + + mask = [0] * len(output_tokens) + mask_labels = [-1] * len(output_tokens) + + for idx in sorted(cand_indices[:num_to_predict]): + mask[idx] = 1 + label = self.mask_token(idx, output_tokens, output_types, + vocab_words, rng) + mask_labels[idx] = label + + return (output_tokens, output_types), mask, mask_labels, pad_mask diff --git a/modelscope/models/nlp/mglm/data_utils/extraction.py b/modelscope/models/nlp/mglm/data_utils/extraction.py new file mode 100644 index 00000000..53027e4f --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/extraction.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 Zhipu.AI + +import glob +import os + +import json +import nltk + +nltk.download('punkt') + + +class NLTKSegmenter: + + def __init(self): + pass + + @staticmethod + def segment_string(article): + return nltk.tokenize.sent_tokenize(article) + + +wiki_path = 'data/extracted' +output_path = 'formatted/wiki-key.txt' +segmenter = NLTKSegmenter() +with open(output_path, 'w') as output: + for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False): + for filename in glob.glob( + os.path.join(dirname, 'wiki_*'), recursive=True): + print(filename) + article_lines = [] + article_open = False + with open(filename, mode='r', newline='\n') as file: + for line in file: + line = line.rstrip() + if '' in line: + key_sentences, contents = [], [] + key, content = None, [] + for sentences in article_lines[1:]: + if len(sentences) > 1: + if key: + if len(content) > 0 or len(contents) == 0: + key_sentences.append(key) + contents.append(content) + else: + contents[-1].append(key) + key, content = None, [] + key_sentences.append(sentences[0]) + contents.append(sentences[1:]) + elif len(sentences) > 0: + if key: + content.append(sentences[0]) + else: + key = sentences[0] + if key: + if len(content) > 0 or len(contents) == 0: + key_sentences.append(key) + contents.append(content) + else: + contents[-1].append(key) + contents = [' '.join(content) for content in contents] + article = {'key': key_sentences, 'content': contents} + 
output.write(json.dumps(article)) + output.write('\n') + article_open = False + article_lines = [] + else: + if article_open and line: + sentences = segmenter.segment_string(line) + article_lines.append(sentences) diff --git a/modelscope/models/nlp/mglm/data_utils/file_utils.py b/modelscope/models/nlp/mglm/data_utils/file_utils.py new file mode 100755 index 00000000..794e127a --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/file_utils.py @@ -0,0 +1,256 @@ +# Modified by Zhipu.AI +# This file is provided as is from: +# https://github.com/huggingface/pytorch-pretrained-BERT +# Please refer to their repository for copyright. +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import os +import shutil +import sys +import tempfile +from functools import wraps +from hashlib import sha256 +from io import open +from urllib.parse import urlparse + +import boto3 +import json +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + Path.home() / '.pytorch_pretrained_bert')) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + 'PYTORCH_PRETRAINED_BERT_CACHE', + os.path.join(os.path.expanduser('~'), '.pytorch_pretrained_bert')) + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError('file {} not found'.format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError('file {} not found'.format(meta_path)) + + with open(meta_path, encoding='utf-8') as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. 
+ """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError('file {} not found'.format(url_or_filename)) + else: + # Something unknown + raise ValueError( + 'unable to parse {} as a URL or as a local path'.format( + url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError('bad s3 path {}'.format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith('/'): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response['Error']['Code']) == 404: + raise EnvironmentError('file {} not found'.format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url): + """Check ETag on S3 object.""" + s3_resource = boto3.resource('s3') + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file): + """Pull a file directly from S3.""" + s3_resource = boto3.resource('s3') + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file): + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit='B', total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. + if url.startswith('s3://'): + etag = s3_etag(url) + else: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + raise IOError( + 'HEAD request failed for url {} with status code {}'.format( + url, response.status_code)) + etag = response.headers.get('ETag') + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. 
+ # Otherwise you get corrupt cache entries if the download gets interrupted. + with tempfile.NamedTemporaryFile() as temp_file: + logger.info('%s not found in cache, downloading to %s', url, + temp_file.name) + + # GET file object + if url.startswith('s3://'): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info('copying %s to cache at %s', temp_file.name, + cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info('creating metadata file for %s', cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w', encoding='utf-8') as meta_file: + json.dump(meta, meta_file) + + logger.info('removing temp file %s', temp_file.name) + + return cache_path + + +def read_set_from_file(filename): + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r', encoding='utf-8') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path, dot=True, lower=True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/modelscope/models/nlp/mglm/data_utils/lazy_loader.py b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py new file mode 100644 index 00000000..77a77a8a --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py @@ -0,0 +1,286 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for loading text from disk""" +import mmap +import os +import pickle as pkl +import time +from itertools import accumulate + +import numpy as np +import torch +from torch.multiprocessing import Lock + + +def get_lazy_path(path): + """ + Gets directory path where lazy files are stored. + """ + return os.path.splitext(path)[0] + '.lazy' + + +def exists_lazy(path, data_type='data'): + """ + Check if we've already made a lazy version of this file for the `data_type` field. 
+ """ + if not os.path.exists(get_lazy_path(path)): + return False + contents = os.listdir(get_lazy_path(path)) + if data_type not in contents: + return False + if data_type + '.len.pkl' not in contents: + return False + return True + + +def get_scatter_path(path, scatter_rank): + path = os.path.splitext(path)[0] + '.scatter' + scatter_path = os.path.join(path, str(scatter_rank)) + return scatter_path + + +def exists_scatter(path, scatter_num=64, data_type='data'): + for i in range(scatter_num): + scatter_path = get_scatter_path(path, scatter_rank=i) + if not exists_lazy(scatter_path, data_type=data_type): + return False + return True + + +class LazyWriter: + + def __init__(self, + path, + data_type, + is_array=False, + array_data_type=np.int32): + lazypath = get_lazy_path(path) + if not os.path.exists(lazypath): + os.makedirs(lazypath) + self.datapath = os.path.join(lazypath, data_type) + self.lenpath = os.path.join(lazypath, data_type + '.len.pkl') + self.array_data_type = array_data_type + self.output = open(self.datapath, 'wb') + self.lengths = [] + self.is_array = is_array + + @staticmethod + def get_len_path(path, data_type): + lazypath = get_lazy_path(path) + return os.path.join(lazypath, data_type + '.len.pkl') + + def write(self, s): + if isinstance(s, dict): + s = s['text'] + if self.is_array: + encoded = np.array( + s, dtype=self.array_data_type).tobytes(order='C') + self.output.write(encoded) + self.lengths.append(len(s)) + else: + encoded = s.encode('utf-8') + self.output.write(encoded) + self.lengths.append(len(encoded)) + + def close(self): + self.output.close() + with open(self.lenpath, 'wb') as f: + pkl.dump(self.lengths, f) + + +def split_strings(strings, start, chr_lens): + """ + Split strings based on string lengths and given start. + """ + return [ + strings[i - start:j - start] + for i, j in zip([start] + chr_lens[:-1], chr_lens) + ] + + +class ProcessorTokenizer: + """ + callable class that runs a preprocessing, as well as tokenization step, + on input text. + """ + + def __init__(self, tokenizer, process_fn=None): + self.tokenizer = tokenizer + self.process_fn = process_fn + + def __call__(self, string): + if self.tokenizer is not None: + string = self.tokenizer(string, process_fn=self.process_fn) + elif self.process_fn is not None: + string = self.process_fn(string) + return string + + +class LazyLoader(object): + """ + Arguments: + path: path to directory where array entries are concatenated into one big string file + and the .len file are located + data_type (str): Some datsets have multiple fields that are stored in different paths. + `data_type` specifies which of these fields to load in this class + mem_map (boolean): Specifies whether to memory map file `path` + map_fn (callable): Fetched strings are passed through map_fn before being returned. 
+ + Example of lazy loader directory structure: + file.json + file.lazy/ + data_type1 + data_type1.len.pkl + data_type2 + data_type2.len.pkl + """ + + def __init__(self, + path, + data_type='data', + mem_map=False, + map_fn=None, + is_array=False, + array_data_type=np.int32, + load_memory=False, + half_load=False): + lazypath = get_lazy_path(path) + datapath = os.path.join(lazypath, data_type) + # get file where array entries are concatenated into one big string + self._file = open(datapath, 'rb') + self.file = self._file + self.is_array = is_array + self.array_data_type = array_data_type + # memory map file if necessary + lenpath = os.path.join(lazypath, data_type + '.len.pkl') + self.lens = pkl.load(open(lenpath, 'rb')) + if half_load: + self.lens = self.lens[:2 * len(self.lens) // 3] + self.ends = list(accumulate(self.lens)) + self.dumb_ends = list(self.ends) + self.mem_map = mem_map + self.load_memory = load_memory + if self.load_memory: + data_type_size = np.dtype(self.array_data_type).itemsize + if half_load: + self.file = self.file.read(sum(self.lens) * data_type_size) + else: + self.file = self.file.read() + self.file = np.ndarray( + shape=(len(self.file) // data_type_size, ), + dtype=array_data_type, + buffer=self.file, + order='C') + elif self.mem_map: + if is_array: + if self.ends[-1] == 0: + self.file = np.array([], dtype=array_data_type) + else: + self.file = np.memmap( + self.file, dtype=array_data_type, mode='r', order='C') + else: + if self.ends[-1] == 0: + self.file = bytearray() + else: + self.file = mmap.mmap( + self.file.fileno(), 0, prot=mmap.PROT_READ) + self.read_lock = Lock() + self.process_fn = map_fn + self.map_fn = map_fn + self._tokenizer = None + self.is_lazy = True + + def SetTokenizer(self, tokenizer): + """ + logic to set and remove (set to None) tokenizer. + combines preprocessing/tokenization into one callable. 
+ """ + if tokenizer is None: + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self._tokenizer = tokenizer + self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) + + def GetTokenizer(self): + return self._tokenizer + + def __getitem__(self, index): + """ + read file and splice strings based on string ending array `self.ends` + """ + if not isinstance(index, slice): + if index == 0: + start = 0 + else: + start = self.ends[index - 1] + end = self.ends[index] + rtn = self.file_read(start, end) + if self.map_fn is not None: + rtn = self.map_fn(rtn) + else: + # if slice, fetch strings with 1 diskread and then splice in memory + chr_lens = self.ends[index] + if index.start == 0 or index.start is None: + start = 0 + else: + start = self.ends[index.start - 1] + stop = chr_lens[-1] + strings = self.file_read(start, stop) + rtn = split_strings(strings, start, chr_lens) + if self.map_fn is not None: + rtn = [self.map_fn(s) for s in rtn] + return rtn + + def __len__(self): + return len(self.ends) + + def file_read(self, start=0, end=None): + """read specified portion of file""" + data_type_size = np.dtype(self.array_data_type).itemsize + # atomic reads to avoid race conditions with multiprocess dataloader + self.read_lock.acquire() + if not self.mem_map and not self.load_memory: + # seek to start of file read + if self.is_array: + start = start * data_type_size + end = end * data_type_size if end is not None else None + self.file.seek(start) + # read to end of file if no end point provided + if end is None: + rtn = self.file.read() + # else read amount needed to reach end point + else: + rtn = self.file.read(end - start) + if self.is_array: + rtn = np.ndarray( + shape=(len(rtn) // data_type_size, ), + dtype=self.array_data_type, + buffer=rtn, + order='C') + else: + rtn = rtn.decode('utf-8', 'ignore') + else: + rtn = self.file[start:end] + if self.is_array: + rtn = rtn.copy() + else: + rtn = rtn.decode('utf-8', 'strict') + self.read_lock.release() + # TODO: @raulp figure out mem map byte string bug + # if mem map'd need to decode byte string to string + # # rtn = str(rtn) + # if self.mem_map: + # rtn = rtn.decode('unicode_escape') + return rtn diff --git a/modelscope/models/nlp/mglm/data_utils/samplers.py b/modelscope/models/nlp/mglm/data_utils/samplers.py new file mode 100644 index 00000000..c0f6e1ab --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/samplers.py @@ -0,0 +1,190 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""batch samplers that work with either random or sequential data samplers""" +import math +import os +import sys + +import numpy as np +import torch +from torch.utils import data + + +class RandomSampler(data.sampler.Sampler): + r""" + Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, + but this class lets the user set an epoch like DistributedSampler + Samples elements randomly. 
If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify ``num_samples`` to draw. + Arguments: + data_source (Dataset): dataset to sample from + num_samples (int): number of samples to draw, default=len(dataset) + replacement (bool): samples are drawn with replacement if ``True``, default=False + """ + + def __init__(self, data_source, replacement=False, num_samples=None): + super(RandomSampler, self).__init__(data_source) + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.epoch = -1 + + if self._num_samples is not None and replacement is False: + raise ValueError( + 'With replacement=False, num_samples should not be specified, ' + 'since a random permute will be performed.') + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError('num_samples should be a positive integer ' + 'value, but got num_samples={}'.format( + self.num_samples)) + if not isinstance(self.replacement, bool): + raise ValueError('replacement should be a boolean value, but got ' + 'replacement={}'.format(self.replacement)) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + g = torch.Generator() + if self.epoch >= 0: + g.manual_seed(self.epoch) + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint( + high=n, size=(32, ), dtype=torch.int64, + generator=g).tolist() + yield from torch.randint( + high=n, + size=(self.num_samples % 32, ), + dtype=torch.int64, + generator=g).tolist() + else: + yield from torch.randperm(n, generator=self.generator).tolist() + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedSequentialSampler(data.sampler.Sampler): + + def __init__(self, + num_samples, + train_iters, + batch_size, + rank=-1, + world_size=2): + super().__init__(num_samples) + if rank == -1: + rank = 0 + world_size = 1 + self.num_samples = num_samples + self.rank = rank + self.world_size = world_size + self.start_iter = 0 + self.train_iters = train_iters + self.batch_size = batch_size + self.batch_bias = [ + i * (num_samples // batch_size) for i in range(batch_size) + ] + + def __iter__(self): + for idx in range(self.start_iter, self.train_iters * 10): + batch = [(idx + bias) % self.num_samples + for bias in self.batch_bias] + tbatch = self._batch(batch) + yield tbatch + + def __len__(self): + return self.train_iters + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] + + +class DistributedBatchSampler(data.sampler.BatchSampler): + """ + similar to normal implementation of distributed sampler, except implementation is at the + batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary + data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
+ """ + + def __init__(self, + sampler, + batch_size, + drop_last, + rank=-1, + world_size=2, + wrap_last=False, + gradient_accumulation_steps=None): + super(DistributedBatchSampler, self).__init__(sampler, batch_size, + drop_last) + if rank == -1: + assert False, 'should not be here' + self.rank = rank + self.world_size = world_size + self.sampler.wrap_around = 0 + self.wrap_around = 0 + self.wrap_last = wrap_last + self.start_iter = 0 + self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps # noqa + + def __iter__(self): + batch = [] + i = 0 + for idx in self.data_iterator(self.sampler, wrap_around=False): + batch.append(idx) + if len(batch) == self.batch_size: + tbatch = self._batch(batch) + if i >= self.start_iter * self.effective_batch_size: + yield tbatch + self.start_iter = 0 + i += len(batch) + batch = [] + batch_len = len(batch) + if batch_len > 0 and not self.drop_last: + if self.wrap_last: + self.sampler.wrap_around -= (self.batch_size) + self.wrap_around += (len(batch)) + self.wrap_around %= self.batch_size + yield self._batch(batch) + if self.wrap_last: + self.sampler.wrap_around += self.batch_size + + def data_iterator(self, _iter, wrap_around=False): + """iterates through data and handles wrap around""" + for i, idx in enumerate(_iter): + if i < self.wrap_around % self.batch_size: + continue + if wrap_around: + self.wrap_around += 1 + self.wrap_around %= self.batch_size + yield idx + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank * self.batch_size // self.world_size + end = (self.rank + 1) * self.batch_size // self.world_size + return batch[start:end] diff --git a/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py b/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py new file mode 100644 index 00000000..b4d1afe3 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py @@ -0,0 +1,158 @@ +# Modified by Zhipu.AI +""" +from https://github.com/openai/gpt-2/, changed for chinese +""" +import os # yapf: disable + + +""" +SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation +systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements +subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the +extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end +system that does not depend on language-specific pre/postprocessing. 
+https://github.com/google/sentencepiece + +pip install sentencepiece + +or git clone https://github.com/google/sentencepiece.git +python setup.py install + +""" + + +def get_pairs(word): + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Encoder: + + def __init__(self, encoder, bpe_merges): + self.encoder = encoder + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.max_len = 0 + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + if not pairs: + return token + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + return [self.encoder.get(token, 1) for token in self.tokenize(text)] + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + return text + + def tokenize(self, text): + bpe_tokens = [] + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + return [self.encoder.get(token, 1) for token in tokens] + + +class Encoder_SP: + + def __init__(self, model_path): + import sentencepiece as spm + self.sp = spm.SentencePieceProcessor() + self.sp.Load(model_path) + + def encode(self, text): + """ + text="...." + """ + return self.sp.EncodeAsIds(text) + + def decode(self, tokens): + """ + tokens=[x1,x2,...] 
+ """ + text = [int(token) for token in tokens] + # print(text) + return self.sp.DecodeIds(text) + + def tokenize(self, text): + return self.sp.EncodeAsPieces(text) + + def convert_tokens_to_ids(self, tokens): + return [self.sp.PieceToId(token) for token in tokens] + + def convert_token_to_id(self, token): + return self.sp.PieceToId(token) + + def convert_id_to_token(self, idx): + return self.sp.IdToPiece(idx) + + +def get_encoder(encoder_file, bpe_file): + import json + filepath, filename = os.path.split(encoder_file) + shotname, extension = os.path.splitext(filename) + + if ('.model' == extension) and (bpe_file == ''): + return Encoder_SP(encoder_file) + else: + with open(encoder_file, 'r', encoding='utf-8') as f: + encoder = json.load(f) + with open(bpe_file, 'r', encoding='utf-8') as f: + bpe_data = f.read() + bpe_merges = [ + tuple(merge_str.split()) + for merge_str in bpe_data.split('\n')[1:-1] + ] + return Encoder( + encoder=encoder, + bpe_merges=bpe_merges, + ) + + +def from_pretrained(model_path): + return get_encoder(model_path + '/tokenizer/mglm250k/mglm250k-uni.model', + '') diff --git a/modelscope/models/nlp/mglm/data_utils/tokenization.py b/modelscope/models/nlp/mglm/data_utils/tokenization.py new file mode 100755 index 00000000..c89cc371 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/tokenization.py @@ -0,0 +1,1396 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" +import csv +import itertools +import os +import random +from collections import namedtuple + +import nltk +import regex as re +import sentencepiece as spm +import torch +from nltk import tokenize as nltk_tokenize + +from . import sp_tokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .wordpiece import PRETRAINED_VOCAB_ARCHIVE_MAP, BertTokenizer + + +def make_tokenizer(tokenizer_type, + corpus, + model_path=None, + vocab_size=None, + model_type=None, + pad_token=0, + character_coverage=1.0, + command_tokens=None, + type_tokens=None, + **kwargs): + """ + Helper function to instantiate a tokenizer given common combinations of options. + """ + tokenizer_class = tokenizer_type + if isinstance(tokenizer_class, str): + tokenizer_class = eval(tokenizer_class) + if tokenizer_class is BertWordPieceTokenizer: + return BertWordPieceTokenizer(model_type, **kwargs) + elif tokenizer_class is GPT2BPETokenizer: + if model_type is None: + model_type = 'gpt2' + return GPT2BPETokenizer(model_type, **kwargs) + elif tokenizer_class is ChineseSPTokenizer: + return ChineseSPTokenizer(model_path, **kwargs) + text_tokenizer = tokenizer_class( + corpus=corpus, + vocab_size=vocab_size, + model_path=model_path, + model_type=model_type, + pad_token=pad_token, + character_coverage=character_coverage) + return Tokenizer(text_tokenizer, command_tokens, type_tokens) + + +class Tokenization(object): + """ + Tokenization object to hold tokenization, (processed text),and original + text. 
Can hold tokenization as Ids or tokens. + + It also holds command tokens (pad, unk, etc.) for the tokenization. + This allows functions to pad/operate on tokenizations without having + access to the full tokenizer, just the tokenization. + + Several standard array operations are implemented (insert, append, extend). + """ + + def __init__(self, + tokenization, + text=None, + original_text=None, + command_tokens=None, + asIds=True): + self.tokenization = tokenization + self.text = text + if self.text is None: + self.text = self.tokenization + self.original_text = original_text + if self.original_text is None: + self.original_text = self.text + self.command_tokens = command_tokens + self.asIds = asIds + self.parse_command_tokens() + + def set_command_tokens(self, command_tokens): + self.command_tokens = command_tokens + return self.parse_command_tokens() + + def parse_command_tokens(self): + if self.command_tokens is None: + return + for command_token in self.command_tokens: + if self.asIds: + setattr(self, command_token.name, command_token.Id) + else: + setattr(self, command_token.name, command_token.token) + + def __getitem__(self, index): + return self.tokenization[index] + + def __len__(self): + return len(self.tokenization) + + def insert(self, idx, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.insert(idx, other.Id) + if idx == 0: + self.text = other.token + self.text + self.original_text = other.token + self.original_text + elif idx == len(self.tokenization) - 1: + self.text += other.token + self.original_text += other.token + elif isinstance(other, Tokenization): + self.tokenization = self.tokenization[: + idx] + other.tokenization + self.tokenization[ + idx:] + else: + self.tokenization = self.tokenization[: + idx] + other.tokenization + self.tokenization[ + idx:] + + def append(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text += other.token + self.original_text += other.token + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.append(other) + return self + + def extend(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text += other.token + self.original_text += other.token + elif isinstance(other, list) and isinstance(other[0], + (CommandToken, TypeToken)): + self.tokenization.extend([o.Id for o in other]) + self.text += [o.token for o in other] + self.original_text += [o.token for o in other] + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.extend(other) + return self + + +"""define some default command tokens for the tokenizer to use""" +token_format = '<{0}>' + +COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id')) + + +def prep_command_tokens(tokenlist, token_format=token_format): + return [ + CommandToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class CommandToken(object): + + def __init__(self, name, token, Id, lstrip=False, rstrip=False): + self.name = name + self.token = token + self.Id = Id + self.lstrip = lstrip + self.rstrip = rstrip + + def __str__(self): + return str(COMMAND_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_COMMAND_TOKENS = [ + ('pad', 0), + ('eos', 1), + ('bos', 2), + ('unk', 3), + 
('sep', 4), + ('L2R', 5), + ('ENC', 6), + ('MASK', 7), +] +DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) +"""define some default type tokens for bert training""" + +TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id')) + + +def prep_type_tokens(tokenlist, token_format=token_format): + return [ + TypeToken(tok[0], token_format.format(tok[0]), tok[1]) + for tok in tokenlist + ] + + +class TypeToken(object): + + def __init__(self, name, token, Id): + self.name = name + self.token = token + self.Id = Id + + def __str__(self): + return str(TYPE_TUPLE(self.name, self.token, self.Id)) + + +DEFAULT_TYPE_TOKENS = [ + ('function', 0), + ('command', 1), + ('str0', 2), + ('str1', 3), + ('str2', 4), + ('embedding0', 5), + ('embedding1', 6), + ('embedding2', 7), + ('arg0', 8), + ('arg1', 9), + ('arg2', 10), +] +DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS) + + +class Tokenizer(object): + """ + Tokenizer object that handles text tokenization, command tokens, and type tokens. + + Command tokens and text tokens are stored together in one mapping of size + `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first + `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. + + Token types are stored in a separate mapping of size `len(type_tokens)`. + """ + + def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): + # set text tokenizer + self.text_tokenizer = text_tokenizer + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = len(self.text_tokenizer) + + # set command tokens + if command_tokens is None: + command_tokens = DEFAULT_COMMAND_TOKENS + self._command_tokens = command_tokens + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + if not hasattr(self, 'num_command_tokens'): + self.num_command_tokens = len(self._command_tokens) + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_command_tokens + self.num_text_tokens + + # set type tokens + if type_tokens is None: + type_tokens = DEFAULT_TYPE_TOKENS + self.type_tokens = type_tokens + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + if not hasattr(self, 'num_type_tokens'): + self.num_type_tokens = len(self.type_tokens) + + # parse tokens and vocabs from tokenizer + self._tokens = list(self.command_token_map.keys()) + list( + self.text_tokenizer.tokens) + self._vocab = {t: Id for Id, t in self.command_id_map.items()} + self._vocab.update({ + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + }) + + self._text_tokens = list(self.text_tokenizer.tokens) + self._text_token_vocab = { + t: Id + self.num_command_tokens + for t, Id in self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def __call__(self, text, process_fn=None): + """run preprocessing and encode text as Ids""" + return self.EncodeAsIds(text, process_fn=process_fn) + + def __len__(self): + """total number of tokens""" + return self.num_tokens + 
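
Editorial aside, not part of this patch: the id layout described in the class docstring above (command tokens keep ids 0..num_command_tokens-1, and the wrapped text tokenizer's ids are shifted up by num_command_tokens) can be seen with a minimal sketch. The _ToyTextTokenizer below is hypothetical and exists only for illustration; the real wrappers further down in this file use WordPiece, GPT-2 BPE, or SentencePiece vocabularies. The sketch assumes the Tokenizer class and DEFAULT_COMMAND_TOKENS defined in this module.

    # Minimal illustrative sketch; _ToyTextTokenizer is a made-up stand-in.
    class _ToyTextTokenizer:
        tokens = ['hello', 'world']
        vocab = {'hello': 0, 'world': 1}

        def __len__(self):
            return len(self.tokens)

        def TokenToId(self, token):
            return self.vocab[token]

        def IdToToken(self, Id):
            return self.tokens[Id]

    toy = Tokenizer(_ToyTextTokenizer())   # falls back to DEFAULT_COMMAND_TOKENS
    assert len(toy) == 10                  # 8 command tokens + 2 text tokens
    assert toy.TokenToId('<pad>') == 0     # command tokens keep ids 0..7
    assert toy.TokenToId('hello') == 8     # text ids are shifted by num_command_tokens
    assert toy.IdToToken(9) == 'world'
    assert toy.get_command('MASK').token == '<MASK>'
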
+ def get_command(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def get_type(self, name): + """get type token corresponding to `name`""" + return self.type_name_map[name] + + @property + def tokens(self): + """list (or iterable) of all tokens for tokenizer""" + return self._tokens + + @property + def vocab(self): + """dictionary mapping tokens to ids for tokenizer""" + return self._vocab + + @property + def token_types(self): + """list (or iterable) of all token types for tokenizer""" + return self._token_types + + @property + def token_type_vocab(self): + """dictionary mapping token types to ids for tokenizer""" + return self._token_type_vocab + + @property + def command_tokens(self): + """list (or iterable) of all command tokens for tokenizer""" + return self._command_token_tokens + + @property + def command_token_vocab(self): + """dictionary mapping command tokens to ids for tokenizer""" + return self._command_token_vocab + + @property + def text_tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + return self._text_tokens + + @property + def text_token_vocab(self): + """dictionary mapping text tokens to ids for text tokenizer""" + return self._text_token_vocab + + def EncodeAsIds(self, text, process_fn=None): + """ + encode text using text tokenizer and shift Id values for command tokens + """ + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. 
https://github.com/huggingface/transformers/pull/2778 + # and https://github.com/huggingface/transformers/issues/3788 + # Strip white spaces on the right + if tok_extended.rstrip and i > 0: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + sub_text = sub_text.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and i < len(split_text) - 1: + sub_text = sub_text.rstrip() # Opposite here + + if i == 0 and not sub_text: + result.append(tok) + elif i == len(split_text) - 1: + if sub_text: + result.append(sub_text) + else: + pass + else: + if sub_text: + result.append(sub_text) + result.append(tok) + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self.text_tokenizer.encode(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self._command_token_tokens: + tokenized_text.extend(split_on_token(tok, sub_text)) + else: + tokenized_text.append(sub_text) + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + (self._encode(token) + if token not in self._command_token_tokens else + [self.command_token_map[token].Id] + for token in tokenized_text))) + + no_split_tokens = self._command_tokens + Ids = split_on_tokens(no_split_tokens, processed_text) + tokenization = Tokenization(Ids, processed_text, text) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def _encode(self, text): + raise NotImplementedError + + def EncodeAsTokens(self, text, process_fn=None): + """ + encode text as tokens using text tokenizer + """ + tokenization = self.text_tokenizer.EncodeAsTokens( + text, process_fn=process_fn) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + + def IdToToken(self, Id, type_token=False): + """convert Id to token accounting for command and type tokens""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id < self.num_command_tokens: + return self.command_id_map[Id].token + return self.text_tokenizer.IdToToken(Id - self.num_command_tokens) + + def TokenToId(self, token, type_token=False): + """convert token to Id accounting for command and type tokens""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + if token in self.command_token_map: + return self.command_token_map[token].Id + return self.text_tokenizer.TokenToId(token) + self.num_command_tokens + + def DecodeIds(self, Ids, type_token=False): + """ + convert Ids to tokens accounting for command and type tokens, tokens + are joined and returned as a string. + """ + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. 
+ type_id_map[Id].token for Id in Ids) + rtn_strs = [] + current_str = [] + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + for Id in Ids: + if isinstance(Id, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(Id.token) + elif Id < self.num_command_tokens: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + current_str = [] + rtn_strs.append(self.command_id_map[Id].token) + else: + current_str.append(Id - self.num_command_tokens) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeIds(current_str)) + return ' '.join(rtn_strs) + + def DecodeTokens(self, Tokens, type_token=False): + """ + convert tokens to a string accounting for command and type tokens. + """ + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + rtn_strs = [] + current_str = [] + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + for t in Tokens: + if isinstance(t, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t.token) + elif t in self.command_token_map: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t) + else: + current_str.append(t) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + return ' '.join(rtn_strs) + + +class TextTokenizer(object): + """ + Interface for text tokenizer + """ + + def __init__(self): + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = 0 + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_text_tokens + + def __call__(self, text, process_fn=None): + return self.EncodeAsIds(text, process_fn) + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + raise NotImplementedError( + 'TextTokenizer tokens property not implemented') + + @property + def vocab(self): + """dictionary mapping tokens to ids""" + raise NotImplementedError( + 'TextTokenizer vocab property not implemented') + + @staticmethod + def exists(model_path): + """check if the filepath for a text tokenizer exists""" + raise NotImplementedError( + 'TextTokenizer exists method not implemented') + + def Train(self, corpus): + """train a tokenizer on a data corpus and save model for future use""" + raise NotImplementedError('TextTokenizer Train not implemented') + + def EncodeAsIds(self, text, process_fn=None): + """ + Preprocess text and encode as ids. Return a tokenization object with + original text, processed text, and id tokenization. + """ + raise NotImplementedError('TextTokenizer EncodeAsIds not implemented') + + def EncodeAsTokens(self, text, process_fn=None): + """ + Preprocess text and encode as tokens. Return a tokenization object with + original text, processed text, and token tokenization. + """ + raise NotImplementedError( + 'TextTokenizer EncodeAsTokens not implemented') + + def IdToToken(self, Id): + """Convert an Id to Token. Reverse lookup of self.vocab""" + raise NotImplementedError('TextTokenizer IdToToken not implemented') + + def TokenToId(self, token): + """Convert a Token to Id. 
Lookup of self.vocab""" + raise NotImplementedError('TextTokenizer TokenToId not implemented') + + def DecodeIds(self, Ids): + """Convert a list or tokenization object of Ids to a text string""" + raise NotImplementedError('TextTokenizer DecodeIds not implemented') + + def DecodeTokens(self, Tokens): + """Convert a list or tokenization object of tokens to a text string""" + raise NotImplementedError('TextTokenizer DecodeTokens not implemented') + + +class CharacterLevelTokenizer(TextTokenizer): + """ + Text tokenizer for ASCII-256 Character Level Tokenization. + """ + + def __init__(self, **kwargs): + self.num_text_tokens = 256 + super(CharacterLevelTokenizer, self).__init__() + self._tokens = [ + self.IdToToken(Id) for Id in range(self.num_text_tokens) + ] + self._vocab = {t: i for i, t in enumerate(self._tokens)} + + def __len__(self): + return 256 + + @staticmethod + def exists(model_path): + return True + + def Train(self, corpus): + pass + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + def EncodeAsIds(self, text, process_fn=None): + """convert text to ascii 256 Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [self.TokenToId(c) for c in processed_text] + return Tokenization(tokens, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert text to ascii 256 characters""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [c for c in processed_text] + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id): + """ascii index to character""" + return chr(Id) + + def TokenToId(self, token): + """ascii character to index""" + return ord(token) + + def DecodeIds(self, Ids): + """converts ascii ids to tokens before joining them into text""" + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return ''.join([self.IdToToken(tok) for tok in Ids]) + + def DecodeTokens(self, Tokens): + """just concatenates ascii tokens into text""" + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ''.join(Tokens) + + +MAX_SENTENCEPIECE_SENTENCES = 100000000 + + +def get_corpus_freq(dataset, filepath, filetype='tsv'): + """ + Take corpus, split it into sentences, and extract word frequencies. + Write frequencies to `filepath` as a tsv. Only write the first + MAX_SENTENCEPIECE_SENTENCES most common words to the file. 
+ """ + nltk.download('punkt', download_dir='./nltk') + if filetype == 'tsv': + delimiter = '\t' + else: + delimiter = ',' + + print('compute corpus frequency\n', flush=True) + + total_sentence_count = 0 + maxlen = 0 + freqs = {} + for entry in dataset: + if isinstance(entry, dict): + entry = entry['text'] + lines = entry.strip().split('\n') + for line in lines: + sentences = nltk_tokenize.sent_tokenize(line) + total_sentence_count += len(sentences) + for sentence in sentences: + maxlen = max(len(line), maxlen) + for word in sentence.split(): + if word not in freqs: + freqs[word] = 0 + freqs[word] += 1 + + print('length of freqs before truncating ' + str(len(freqs)), flush=True) + print('file path for freq ' + str(filepath), flush=True) + + freqs_sorted = {} + counter = 0 + for word, count in sorted(freqs.items(), key=lambda x: x[1], reverse=True): + if counter >= MAX_SENTENCEPIECE_SENTENCES: + break + counter += 1 + freqs_sorted[word] = count + + print( + 'length of freqs after trancating ' + str(len(freqs_sorted)), + flush=True) + + with open(filepath, 'w') as f: + writer = csv.writer(f, delimiter=delimiter) + for k, v in freqs_sorted.items(): + writer.writerow([str(k), str(v)]) + + return total_sentence_count, maxlen + + +class SentencePieceTokenizer(TextTokenizer): + """Trains and uses sentencepiece for text tokenization""" + + def __init__(self, + model_type='bpe', + vocab_size=None, + corpus=None, + model_path=None, + character_coverage=1.0, + **kwargs): + self.character_coverage = character_coverage + self.model_type = model_type.lower() + self.spm_model = model_path + self.num_text_tokens = vocab_size + make_train = not SentencePieceTokenizer.exists(self.spm_model) + if make_train: + assert corpus is not None and self.num_text_tokens is not None + self.Train(corpus, self.num_text_tokens) + self._tokens = [] + self._vocab = {} + self.load_spm_model() + super(SentencePieceTokenizer, self).__init__() + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + @staticmethod + def exists(model_path): + if model_path is None: + return False + # check if path exists + dne = not os.path.exists(model_path) + # check if path.model exists + if dne and not model_path.endswith('.model'): + dne = not os.path.exists(model_path + '.model') + return not dne + + def load_spm_model(self): + """load sentencepiece model and parse vocab""" + if not os.path.exists( + self.spm_model) and not self.spm_model.endswith('.model'): + self.spm_model = self.spm_model + '.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(self.spm_model) + self.vocab_size = self.num_text_tokens = len(self.sp) + self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)] + self._vocab = {t: i for i, t in enumerate(self._tokens)} + + def Train(self, corpus, num_text_tokens): + """train sentencepiece model on corpus using word frequencies""" + self.num_text_tokens = num_text_tokens + use_model_path = self.spm_model + random_hash = str(random.randint(0, 2147483647)) + if use_model_path is None: + use_model_path = random_hash + if use_model_path.endswith('.model'): + use_model_path = use_model_path[:use_model_path.rfind('.model')] + input_path = use_model_path + '.tsv.' 
+ random_hash + line_count, maxlenline = get_corpus_freq(corpus, input_path) + line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES) + print( + 'line count used as input_sentence_size ', line_count, flush=True) + print('training sentencepiece model', flush=True) + train_string = '--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}' \ + + ' --model_type={model_type} --character_coverage={character_coverage} ' \ + + '--input_sentence_size={input_sentence_size} ' \ + + '--input_format=tsv' + train_string = train_string.format( + file_path=input_path, + model_prefix=use_model_path, + vocab_size=num_text_tokens, + model_type=self.model_type, + character_coverage=self.character_coverage, + input_sentence_size=int(line_count)) # , #)#, + print( + 'calling spm.SentencePieceTrainer.Train(%s)' % (train_string), + flush=True) + spm.SentencePieceTrainer.Train(train_string) + os.remove(input_path) + self.spm_model = use_model_path + '.model' + print('sentencepiece model written to ' + self.spm_model, flush=True) + + def EncodeAsIds(self, text, process_fn=None): + """convert text to sentencepiece Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.sp.EncodeAsIds(processed_text) + return Tokenization(tokens, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert text to sentencepiece tokens""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.sp.EncodeAsTokens(processed_text) + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id): + """convert Id to sentencpiece token""" + return self.sp.IdToPiece(Id) + + def TokenToId(self, token): + """convert sentencpiece token to Id""" + return self.sp.PieceToId(token) + + def DecodeIds(self, Ids): + """converts ids to a text string""" + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return self.sp.DecodeIds(Ids) + + def DecodeTokens(self, Tokens): + """converts sentencepiece tokens to a text string""" + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.sp.DecodeTokens(Tokens) + + +class BertWordPieceTokenizer(Tokenizer): + """ + Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization + in BERT training. Default to bert-large-uncased tokenizer. 
+ """ + + def __init__(self, + tokenizer_model_type=None, + cache_dir=None, + add_block_symbols=False, + add_sentinel_token=0, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + # default to bert-large-uncased tokenizer + if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP: + tokenizer_model_type = 'bert-large-uncased' + if not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0: + print('loading BertWordPieceTokenizer (', tokenizer_model_type, + ') from cache_dir ', cache_dir) + do_lower_case = not ('-cased' in tokenizer_model_type + or 'chinese' in tokenizer_model_type) + self.text_tokenizer = BertTokenizer.from_pretrained( + tokenizer_model_type, + do_lower_case=do_lower_case, + cache_dir=cache_dir) + if not torch.distributed.is_initialized( + ) or torch.distributed.get_rank() == 0: + print('loaded', tokenizer_model_type) + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + + # set command tokens from wordpiece tokenizer values + self.num_command_tokens = 6 + self.num_tokens = len(self.text_tokenizer.vocab) + self.num_text_tokens = self.num_tokens - 5 + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']), + CommandToken('MASK', '[MASK]', + self.text_tokenizer.vocab['[MASK]']), + CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), + CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), + CommandToken('eos', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + self._command_tokens.extend([ + CommandToken('gMASK', '[gMASK]', self.num_tokens), + CommandToken('sMASK', '[sMASK]', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + if add_sentinel_token > 0: + for i in range(1, add_sentinel_token): + self._command_tokens.extend([ + CommandToken(f'MASK{i}', f'[MASK{i}]', self.num_tokens), + CommandToken(f'sop{i}', f'<|startofpiece{i}|>', + self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + # set type tokens + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # parse tokens and vocabs from tokenizer + + self._tokens = list(self.text_tokenizer.vocab.keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.vocab.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = { + k: v + for k, v in self.text_tokenizer.vocab.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + 
self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def _encode(self, text): + tokens = self.text_tokenizer.tokenize(text) + ids = self.text_tokenizer.convert_tokens_to_ids(tokens) + return ids + + def EncodeAsTokens(self, text, process_fn=None): + """convert wordpiece token to Id""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, type_token=False): + """convert Id to sentencpiece token""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id in self.command_id_map: + return self.command_id_map[Id].token + return self.text_tokenizer.ids_to_tokens[Id] + + def TokenToId(self, token, type_token=False): + """convert sentencpiece token to Id""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.vocab[token] + + def DecodeIds(self, Ids, type_token=False): + """converts ids to wordpiece tokens and joins them as a text string""" + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + Tokens = [] + for Id in Ids: + if Id in self.command_id_map: + Tokens.append(self.command_id_map[Id].token) + elif Id in self.text_tokenizer.ids_to_tokens: + Tokens.append(self.text_tokenizer.ids_to_tokens[Id]) + new_tokens = [] + for token in Tokens: + if token.startswith('##') and len(new_tokens) > 0: + new_tokens[-1] += token[2:] + else: + new_tokens.append(token) + return ' '.join(new_tokens) + + def DecodeTokens(self, Tokens, type_token=False): + """converts wordpiece tokens to a text string""" + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ' '.join(Tokens) + + +class GPT2BPETokenizer(Tokenizer): + + def __init__(self, + model_type_or_path, + cache_dir=None, + add_block_symbols=False, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + self.text_tokenizer = GPT2Tokenizer.from_pretrained( + model_type_or_path, cache_dir=cache_dir) + + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + self.num_tokens = len(self.text_tokenizer.encoder) + self.num_type_tokens = 2 + if model_type_or_path.startswith('roberta'): + self.num_command_tokens = 6 + self.num_text_tokens = self.num_tokens - 3 + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['']), + CommandToken('sep', '[SEP]', + self.text_tokenizer.encoder['']), + CommandToken('ENC', '[CLS]', + self.text_tokenizer.encoder['']), + CommandToken( + 'MASK', + '[MASK]', + self.text_tokenizer.encoder[''], + lstrip=True), + CommandToken('unk', '[UNK]', + self.text_tokenizer.encoder['']) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + else: + self.num_command_tokens = 2 + self.num_text_tokens = self.num_tokens - 1 + 
self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']), + CommandToken('eos', '<|endoftext|>', + self.text_tokenizer.encoder['<|endoftext|>']) + ] + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), + CommandToken('ENC', '[CLS]', self.num_tokens + 2), + CommandToken( + 'MASK', '[MASK]', self.num_tokens + 3, lstrip=True), + CommandToken('sep', '[SEP]', self.num_tokens + 4), + CommandToken('unk', '[UNK]', self.num_tokens + 5) + ]) + self.num_tokens += 6 + self.num_command_tokens += 6 + if add_block_symbols: + if add_task_mask: + self._command_tokens.extend([ + CommandToken( + 'gMASK', '[gMASK]', self.num_tokens, lstrip=True), + CommandToken( + 'sMASK', '[sMASK]', self.num_tokens + 1, lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + self._tokens = list(self.text_tokenizer.encoder.keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.encoder.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = { + k: v + for k, v in self.text_tokenizer.encoder.items() + } + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + for idx, tok in self.command_id_map.items(): + self.text_tokenizer.decoder[idx] = tok.token + + def EncodeAsIds(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + + def split_on_token(tok_extended: CommandToken, text): + result = [] + tok = tok_extended.token + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + # CommandToken can control whitespace stripping around them. + # We use them for GPT2 and Roberta to have different behavior depending on the special token + # Cf. 
https://github.com/huggingface/transformers/pull/2778
+                # and https://github.com/huggingface/transformers/issues/3788
+                # Strip white spaces on the right
+                if tok_extended.rstrip and i > 0:
+                    # A bit counter-intuitive but we strip the left of the string
+                    # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                    sub_text = sub_text.lstrip()
+                # Strip white spaces on the left
+                if tok_extended.lstrip and i < len(split_text) - 1:
+                    sub_text = sub_text.rstrip()  # Opposite here
+
+                if i == 0 and not sub_text:
+                    result.append(tok)
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result.append(sub_text)
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result.append(sub_text)
+                    result.append(tok)
+            return result
+
+        def split_on_tokens(tok_list, text):
+            if not text.strip():
+                return []
+            if not tok_list:
+                return self.text_tokenizer.encode(text)
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self._command_token_tokens:
+                        tokenized_text.extend(split_on_token(tok, sub_text))
+                    else:
+                        tokenized_text.append(sub_text)
+                text_list = tokenized_text
+
+            return list(
+                itertools.chain.from_iterable(
+                    (self.text_tokenizer.encode(token)
+                     if token not in self._command_token_tokens else
+                     [self.command_token_map[token].Id]
+                     for token in tokenized_text)))
+
+        no_split_tokens = self._command_tokens
+        Ids = split_on_tokens(no_split_tokens, processed_text)
+        tokenization = Tokenization(Ids, processed_text, text)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def _encode(self, text):
+        return self.text_tokenizer.encode(text)
+
+    def EncodeAsTokens(self, text, process_fn=None):
+        processed_text = text
+        if process_fn is not None:
+            processed_text = process_fn(processed_text)
+        tokens = []
+        for token in re.findall(self.text_tokenizer.pat, processed_text):
+            token = ''.join(self.text_tokenizer.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            tokens.extend(
+                bpe_token
+                for bpe_token in self.text_tokenizer.bpe(token).split(' '))
+        tokenization = Tokenization(tokens, processed_text, text, asIds=False)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def DecodeAsTokens(self, Ids):
+        return [self.IdToToken(x) for x in Ids]
+
+    def IdToToken(self, Id, type_token=False):
+        if isinstance(Id, (TypeToken, CommandToken)):
+            return Id.token
+        if type_token:
+            return self.type_id_map[Id].token
+        if Id in self.command_id_map:
+            return self.command_id_map[Id].token
+        return self.text_tokenizer.decoder[Id]
+
+    def TokenToId(self, token, type_token=False):
+        if isinstance(token, (TypeToken, CommandToken)):
+            return token.Id
+        if type_token:
+            return self.type_token_map[token].Id
+        return self.text_tokenizer.encoder[token]
+
+    def DecodeIds(self, Ids, type_token=False):
+        if type_token:
+            return ' '.join(
+                Id.token if isinstance(Id, TypeToken) else self.
+ type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return self.text_tokenizer.decode(Ids) + + def DecodeTokens(self, Tokens, type_token=False): + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.text_tokenizer.decode( + [self.TokenToId(tok) for tok in Tokens]) + + +class ChineseSPTokenizer(Tokenizer): + + def __init__(self, + model_path, + add_block_symbols=False, + add_task_mask=False, + add_decoder_mask=False, + **kwargs): + self.text_tokenizer = sp_tokenizer.from_pretrained(model_path) + + self.num_command_tokens = 0 + self.num_text_tokens = self.text_tokenizer.sp.vocab_size() + self.num_tokens = self.num_text_tokens + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '<|endoftext|>', self.num_text_tokens), + CommandToken('eos', '<|endoftext|>', self.num_text_tokens), + CommandToken('sep', '[SEP]', self.num_text_tokens + 1), + CommandToken('ENC', '[CLS]', self.num_text_tokens + 2), + CommandToken( + 'MASK', '[MASK]', self.num_text_tokens + 3, lstrip=True), + CommandToken('unk', '[UNK]', self.num_text_tokens + 4) + ] + self.num_tokens += 5 + self.num_command_tokens += 6 + if add_block_symbols: + self._command_tokens.extend([ + CommandToken('sop', '<|startofpiece|>', self.num_tokens + 1), + CommandToken('eop', '<|endofpiece|>', self.num_tokens + 2) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_task_mask: + self._command_tokens.extend([ + CommandToken( + 'gMASK', '[gMASK]', self.num_tokens, lstrip=True), + CommandToken( + 'sMASK', '[sMASK]', self.num_tokens + 1, lstrip=True) + ]) + self.num_tokens += 2 + self.num_command_tokens += 2 + if add_decoder_mask: + self._command_tokens.extend( + [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) + self.num_tokens += 1 + self.num_command_tokens += 1 + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # self._tokens = list(self.text_tokenizer.encoder.keys()) + # self._vocab = {k:v for k,v in self.text_tokenizer.encoder.items()} + # + # self._text_tokens = list(self._tokens) + # self._text_token_vocab = {k:v for k,v in self.text_tokenizer.encoder.items()} + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = { + t: Id + for Id, t in self.command_id_map.items() + } + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()} + + def _encode(self, text): + ids = self.text_tokenizer.encode(text) + return ids + + def EncodeAsTokens(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + tokenization = Tokenization(tokens, processed_text, text, asIds=False) + tokenization.set_command_tokens(self._command_tokens) + return tokenization + # return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, 
type_token=False): + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + if Id in self.command_id_map: + return self.command_id_map[Id].token + elif Id in self.type_id_map: + return self.type_id_map[Id].token + else: + return self.text_tokenizer.convert_id_to_token(int(Id)) + + def TokenToId(self, token, type_token=False): + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.convert_token_to_id(token) + + def DecodeIds(self, Ids, type_token=False): + if type_token: + return ' '.join( + Id.token if isinstance(Id, TypeToken) else self. + type_id_map[Id].token for Id in Ids) + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + Ids = list(map(int, Ids)) + pieces = [] + last = 0 + for i, token_id in enumerate(Ids): + if token_id in self.command_id_map: + pieces.append(Ids[last:i]) + pieces.append(token_id) + last = i + 1 + pieces.append(Ids[last:]) + text = '' + for piece in pieces: + if isinstance(piece, int): + text += self.command_id_map[piece].token + elif piece: + text += self.text_tokenizer.decode(piece) + return text + + def DecodeTokens(self, Tokens, type_token=False): + if type_token: + return ' '.join( + t.token if isinstance(t, TypeToken) else t for t in Tokens) + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.text_tokenizer.decode( + [self.TokenToId(tok) for tok in Tokens]) diff --git a/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py b/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py new file mode 100644 index 00000000..d179e055 --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py @@ -0,0 +1,359 @@ +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import os +import sys +from io import open + +import json +import regex as re + +from .file_utils import cached_path + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
+ def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': '.pytorch_pretrained_bert/gpt2-vocab.json', + 'roberta': '.pytorch_pretrained_bert/roberta-vocab.json' +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': '.pytorch_pretrained_bert/gpt2-merges.txt', + 'roberta': '.pytorch_pretrained_bert/roberta-merges.txt' +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + + @classmethod + def from_pretrained(cls, + pretrained_model_name_or_path, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ + pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[ + pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, + VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, + MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, + SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info('loading special tokens file {}'.format( + special_tokens_file)) + # redirect to the cache, if necessary + # try: + # resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + # resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + # except EnvironmentError: + # logger.error( + # "Model name '{}' was not found in model name list ({}). 
" + # "We assumed '{}' was a path or url but couldn't find files {} and {} " + # "at this path or url.".format( + # pretrained_model_name_or_path, + # ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + # pretrained_model_name_or_path, + # vocab_file, merges_file)) + # return None + # if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + # logger.info("loading vocabulary file {}".format(vocab_file)) + # logger.info("loading merges file {}".format(merges_file)) + # else: + # logger.info("loading vocabulary file {} from cache at {}".format( + # vocab_file, resolved_vocab_file)) + # logger.info("loading merges file {} from cache at {}".format( + # merges_file, resolved_merges_file)) + resolved_vocab_file = vocab_file + resolved_merges_file = merges_file + logger.info('loading vocabulary file {}'.format(vocab_file)) + logger.info('loading merges file {}'.format(merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open( + special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls( + resolved_vocab_file, + resolved_merges_file, + special_tokens=special_tokens, + *inputs, + **kwargs) + return tokenizer + + def __init__(self, + vocab_file, + merges_file, + errors='replace', + special_tokens=None, + max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" + ) + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = { + v: k + for k, v in self.special_tokens.items() + } + logger.info('Special tokens {}'.format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend( + bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 + and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this OpenAI GPT model ({} > {}). Running this' + ' sequence through the model will result in indexing errors'. 
+ format(len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error('Vocabulary path ({}) should be a directory'.format( + vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, 'w', encoding='utf-8') as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted( + self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + 'Saving vocabulary to {}: BPE merge indices are not consecutive.' + ' Please check that the tokenizer is not corrupted!'. + format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted( + self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + 'Saving special tokens vocabulary to {}: BPE indices are not consecutive.' + ' Please check that the tokenizer is not corrupted!'. + format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/modelscope/models/nlp/mglm/data_utils/wordpiece.py b/modelscope/models/nlp/mglm/data_utils/wordpiece.py new file mode 100755 index 00000000..1cecffbd --- /dev/null +++ b/modelscope/models/nlp/mglm/data_utils/wordpiece.py @@ -0,0 +1,408 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes. 
Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py""" # noqa + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import collections +import logging +import os +import unicodedata +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'bert-base-uncased': + '.pytorch_pretrained_bert/bert-base-uncased-vocab.txt', + 'bert-large-uncased': + '.pytorch_pretrained_bert/bert-large-uncased-vocab.txt', + 'bert-base-cased': + '.pytorch_pretrained_bert/bert-base-cased-vocab.txt', + 'bert-large-cased': + '.pytorch_pretrained_bert/bert-large-cased-vocab.txt', + 'bert-base-multilingual-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt', + 'bert-base-multilingual-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt', + 'bert-base-chinese': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt', +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, +} +VOCAB_NAME = 'vocab.txt' + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, 'r', encoding='utf-8') as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, + vocab_file, + do_lower_case=True, + max_len=None, + do_basic_tokenize=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + 'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + .format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + if self.do_basic_tokenize: + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + 'Token indices sequence length is longer than the specified maximum ' + ' sequence length for this BERT model ({} > {}). Running this' + ' sequence through BERT will result in indexing errors'.format( + len(ids), self.max_len)) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + @classmethod + def from_pretrained(cls, + pretrained_model_name_or_path, + cache_dir=None, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ + pretrained_model_name_or_path] + else: + vocab_file = pretrained_model_name_or_path + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + 'associated to this path or url.'.format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info('loading vocabulary file {}'.format(vocab_file)) + else: + logger.info('loading vocabulary file {} from cache at {}'.format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ + pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')): + """Constructs a BasicTokenizer. 
+ + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
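+        # Illustrative sketch (editorial addition, not from the upstream file):
+        # because of this range check, BasicTokenizer().tokenize('ModelScope支持GLM')
+        # yields ['modelscope', '支', '持', 'glm'] -- every CJK ideograph becomes its
+        # own token, while the Latin text is merely lower-cased.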
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # noqa + (cp >= 0x3400 and cp <= 0x4DBF) or # noqa + (cp >= 0x20000 and cp <= 0x2A6DF) or # noqa + (cp >= 0x2A700 and cp <= 0x2B73F) or # noqa + (cp >= 0x2B740 and cp <= 0x2B81F) or # noqa + (cp >= 0x2B820 and cp <= 0x2CEAF) or # noqa + (cp >= 0xF900 and cp <= 0xFAFF) or # noqa + (cp >= 0x2F800 and cp <= 0x2FA1F)): # noqa + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat.startswith('C'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
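+    # Editorial examples of the rule described above (not in the upstream file):
+    #   _is_punctuation('$') -> True   (ASCII symbol outside the Unicode 'P*' classes)
+    #   _is_punctuation(',') -> True   (Unicode category 'Po')
+    #   _is_punctuation('5') -> False  (letters and digits are never punctuation here)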
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) + or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False diff --git a/modelscope/models/nlp/mglm/fp16/__init__.py b/modelscope/models/nlp/mglm/fp16/__init__.py new file mode 100644 index 00000000..90d20bcf --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fp16 import * # noqa +from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm, + convert_module, convert_network, + master_params_to_model_params, + model_grads_to_master_grads, network_to_half, + prep_param_lists, to_python_float, tofp16) +from .loss_scaler import * # noqa diff --git a/modelscope/models/nlp/mglm/fp16/fp16.py b/modelscope/models/nlp/mglm/fp16/fp16.py new file mode 100755 index 00000000..10fbd804 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/fp16.py @@ -0,0 +1,660 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from .fp16util import (clip_grad_norm, master_params_to_model_params, + model_grads_to_master_grads) +from .loss_scaler import DynamicLossScaler, LossScaler + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + + +def conversion_helper(val, conversion): + """Apply conversion to val. 
Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + + return conversion_helper(val, half_conversion) + + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class FP16_Module(nn.Module): + + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + +# TODO: Update overflow check + downscale to use Carl's fused kernel. +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. 
+ dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. This means that adjusting the loss scale, or using + dynamic loss scaling, should not require retuning the learning rate or any other + hyperparameters. + + + **Advanced options** + + **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. + See docstring for :attr:`step`. + + **Gradient clipping**: Use :attr:`clip_master_grads`. + + **Multiple losses**: If your model accumulates gradients from multiple losses, + this can be made more efficient by supplying ``update_master_grads=False`` + to :attr:`backward`. See docstring for :attr:`backward`. + + **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: + + print(optimizer.loss_scale) + optimizer.loss_scale = new_loss_scale + + For static loss scaling, manually adjusting the loss scale over time is a reasonable + thing to do. During later epochs, gradients may become smaller, and a + higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss + scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting + the loss scale is not recommended. + + **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in + Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` + should still work as intended. 
+ """ # noqa + + def __init__(self, + init_optimizer, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=False): + if not torch.cuda.is_available: + raise SystemError('Cannot use fp16 without CUDA.') + + self.verbose = verbose + + self.optimizer = init_optimizer + # init_state_dict sets up an alternative way to cast per-param state tensors. + # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. + # init_state_dict = init_optimizer.state_dict() + + self.fp16_groups = [] + self.fp32_from_fp16_groups = [] + self.fp32_from_fp32_groups = [] + for i, param_group in enumerate(self.optimizer.param_groups): + self.maybe_print( + 'FP16_Optimizer processing param group {}:'.format(i)) + fp16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_fp16_params_this_group = [] + for i, param in enumerate(param_group['params']): + if param.requires_grad: + if param.type() == 'torch.cuda.HalfTensor': + self.maybe_print( + 'FP16_Optimizer received torch.cuda.HalfTensor with {}' + .format(param.size())) + fp16_params_this_group.append(param) + master_param = param.detach().clone().float() + master_param.requires_grad = True + # Copythe model parallel flag. + master_param.model_parallel = param.model_parallel + param_group['params'][i] = master_param + fp32_from_fp16_params_this_group.append(master_param) + # Reset existing state dict key to the new master param. + # We still need to recast per-param state tensors, if any, to FP32. + if param in self.optimizer.state: + self.optimizer.state[ + master_param] = self.optimizer.state.pop(param) + elif param.type() == 'torch.cuda.FloatTensor': + self.maybe_print( + 'FP16_Optimizer received torch.cuda.FloatTensor with {}' + .format(param.size())) + fp32_params_this_group.append(param) + param_group['params'][i] = param + else: + raise TypeError( + 'Wrapped parameters must be either ' + 'torch.cuda.FloatTensor or torch.cuda.HalfTensor. ' + 'Received {}'.format(param.type())) + + self.fp16_groups.append(fp16_params_this_group) + self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors + self.optimizer.load_state_dict(self.optimizer.state_dict()) + # alternative way to cast per-param state tensors: + # self.optimizer.load_state_dict(init_state_dict) + + if dynamic_loss_scale: + self.dynamic_loss_scale = True + if dynamic_loss_args is not None: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + else: + self.loss_scaler = DynamicLossScaler() + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(static_loss_scale) + + self.overflow = False + self.first_closure_call_this_step = True + + self.clip_grad_norm = clip_grad_norm + + def maybe_print(self, msg): + if self.verbose: + print(msg) + + def __getstate__(self): + raise RuntimeError( + 'FP16_Optimizer should be serialized using state_dict().') + + def __setstate__(self, state): + raise RuntimeError( + 'FP16_Optimizer should be deserialized using load_state_dict().') + + def zero_grad(self, set_grads_to_None=False): + """ + Zero fp32 and fp16 parameter grads. + """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. 
However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_( + ) # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1. / self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. + + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). + """ # noqa + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict[ + 'first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict[ + 'first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + for current_group, saved_group in zip(self.fp32_from_fp16_groups, + state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
+ However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ # noqa + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print( + 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. + # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while (self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print( + 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' + 'reducing to {}'.format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. + optimizer.backward(loss1) + optimizer.backward(loss2) + + # More efficient way to handle multiple losses: + # The fp16->fp32 grad copy is delayed until fp16 grads from all + # losses have been accumulated. + optimizer.backward(loss1, update_master_grads=False) + optimizer.backward(loss2, update_master_grads=False) + optimizer.update_master_grads() + """ # noqa + # To consider: try multiple backward passes using retain_grad=True to find + # a loss scale that works. After you find a loss scale that works, do a final dummy + # backward pass with retain_graph=False to tear down the graph. Doing this would avoid + # discarding the iteration, but probably wouldn't improve overall efficiency. + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + if update_master_grads: + self.update_master_grads() + + def update_master_grads(self): + """ + Copy the ``.grad`` attribute from stored references to fp16 parameters to + the ``.grad`` attribute of the fp32 master parameters that are directly + updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if + ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. + """ # noqa + if self.dynamic_loss_scale: + self._check_overflow() + if self.overflow: return # noqa + self._model_grads_to_master_grads() + self._downscale_master() + + def inspect_master_grad_data(self): + """ + When running with :class:`FP16_Optimizer`, + ``.grad`` attributes of a model's fp16 leaves should not be + regarded as truthful, because they might be scaled. + After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, + the fp32 master params' ``.grad`` + attributes will contain valid gradients properly divided by the loss scale. However, + because :class:`FP16_Optimizer` flattens some parameters, accessing them may be + nonintuitive. :attr:`inspect_master_grad_data` + allows those gradients to be viewed with shapes corresponding to their associated model leaves. + + Returns: + List of lists (one list for each parameter group). The list for each parameter group + is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. + """ + if self.overflow: + print( + 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' + 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' + ) + return None + else: + # The optimizer owns only references to master params. + master_grads_data = [] + for param_group in self.optimizer.param_groups: + master_grads_this_group = [] + for param in param_group['params']: + if param.grad is not None: + master_grads_this_group.append(param.grad.data) + else: + master_grads_this_group.append(None) + master_grads_data.append(master_grads_this_group) + return master_grads_data + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/models/nlp/mglm/fp16/fp16util.py b/modelscope/models/nlp/mglm/fp16/fp16util.py new file mode 100644 index 00000000..3fcd3005 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/fp16util.py @@ -0,0 +1,220 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
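+
+# Editorial usage sketch (not part of the upstream NVIDIA file): the helpers below
+# support a "manual" master-weights loop when FP16_Optimizer is not used; `model`,
+# `loss`, and the learning rate are placeholders.
+#
+#     model_params, master_params = prep_param_lists(model)
+#     optimizer = torch.optim.SGD(master_params, lr=1e-3)
+#
+#     # inside the training loop (model and master grads are zeroed elsewhere):
+#     loss.backward()                          # fp16 grads accumulate on model_params
+#     model_grads_to_master_grads(model_params, master_params)
+#     optimizer.step()                         # update the fp32 master weights
+#     master_params_to_model_params(model_params, master_params)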
+ +import torch +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable + +from modelscope.models.nlp.mglm import mpu + + +class tofp16(nn.Module): + """ + Utility module that implements:: + + def forward(self, input): + return input.half() + """ + + def __init__(self): + super(tofp16, self).__init__() + + def forward(self, input): + return input.half() + + +def BN_convert_float(module): + """ + Utility function for network_to_half(). + + Retained for legacy purposes. + """ + if isinstance( + module, + torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + module.float() + for child in module.children(): + BN_convert_float(child) + return module + + +def network_to_half(network): + """ + Convert model to half precision in a batchnorm-safe way. + + Retained for legacy purposes. It is recommended to use FP16Model. + """ + return nn.Sequential(tofp16(), BN_convert_float(network.half())) + + +def convert_module(module, dtype): + """ + Converts a module's immediate parameters and buffers to dtype. + """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm + ) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError( + 'master_params recieved a gradient in the backward pass!') + + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. + + .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ # noqa + model_params = [ + param for param in model.parameters() if param.requires_grad + ] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors( + [param.data for param in model_params]).float() + except: # noqa + print( + 'Error in prep_param_lists: model may contain a mixture of parameters ' + 'of different types. Use flat_master=False, or use F16_Optimizer.' + ) + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [ + param.clone().float().detach() for param in model_params + ] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, + master_params, + flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. + """ # noqa + if flat_master: + # The flattening may incur one more deep copy than is necessary. + master_params[0].grad.data.copy_( + _flatten_dense_tensors([p.grad.data for p in model_params])) + else: + for model, master in zip(model_params, master_params): + if model.grad is not None: + if master.grad is None: + master.grad = Variable( + master.data.new(*master.data.size())) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master_params_to_model_params(model_params, + master_params, + flat_master=False): + """ + Copy master parameters to model parameters. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. + """ # noqa + if flat_master: + for model, master in zip( + model_params, + _unflatten_dense_tensors(master_params[0].data, model_params)): + model.data.copy_(master) + else: + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + + +# Backward compatibility fixes + + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) + +clip_grad_norm = mpu.clip_grad_norm diff --git a/modelscope/models/nlp/mglm/fp16/loss_scaler.py b/modelscope/models/nlp/mglm/fp16/loss_scaler.py new file mode 100755 index 00000000..721571b3 --- /dev/null +++ b/modelscope/models/nlp/mglm/fp16/loss_scaler.py @@ -0,0 +1,245 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from modelscope.models.nlp.mglm import mpu + + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. + """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
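+
+    A rough sketch of the resulting behaviour (editorial illustration, not part of the
+    upstream docstring), using an explicit initial scale and the default hysteresis::
+
+        scaler = DynamicLossScaler(init_scale=2**15, scale_factor=2., scale_window=1000)
+        scaler.update_scale(overflow=True)    # scale is halved to 2**14 (never below min_scale)
+        for _ in range(1000):
+            scaler.update_scale(overflow=False)
+        # after scale_window clean iterations the scale is doubled back to 2**15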
+ """ # noqa + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan( + p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce( + overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float( + 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale / self.scale_factor, + self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter + - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. 
+if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. + N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + +""" diff --git a/modelscope/models/nlp/mglm/generation_utils.py b/modelscope/models/nlp/mglm/generation_utils.py new file mode 100644 index 00000000..6db75b2d --- /dev/null +++ b/modelscope/models/nlp/mglm/generation_utils.py @@ -0,0 +1,483 @@ +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import UserDict +from typing import Iterable, List, Optional, Tuple + +import torch + +PROCESS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses. 
+ next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`UserDict`: A dictionary composed of the fields as defined above: + + - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated + scores of all non-finished beams. + - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens + to be added to the non-finished beam_hypotheses. + - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices + indicating to which beam the next tokens shall be added. + +""" + +FINALIZE_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The final scores of all non-finished beams. + final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The last tokens to be added to the non-finished beam_hypotheses. + final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated + sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all + batches finished early due to the :obj:`eos_token_id`. + +""" + + +class BeamScorer(ABC): + """ + Abstract base class for all beam scorers that are used for :meth:`~transformers.PretrainedModel.beam_search` and + :meth:`~transformers.PretrainedModel.beam_sample`. + """ + + @abstractmethod + def process(self, input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, + **kwargs) -> Tuple[torch.Tensor]: + raise NotImplementedError('This is an abstract method.') + + @abstractmethod + def finalize(self, input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, **kwargs) -> torch.LongTensor: + raise NotImplementedError('This is an abstract method.') + + +class BeamSearchScorer(BeamScorer): + r""" + :class:`transformers.BeamScorer` implementing standard beam search decoding. + + Adapted in part from `Facebook's XLM beam search code + `__. + + Args: + batch_size (:obj:`int`): + Batch Size of :obj:`input_ids` for which beam search decoding is run in parallel. + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + num_beams (:obj:`int`): + Number of beams for beam search. 
+ device (:obj:`torch.device`): + Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of + :obj:`BeamSearchScorer` will be allocated. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): + The number of beam hypotheses that shall be returned upon calling + :meth:`~transformer.BeamSearchScorer.finalize`. + """ + + def __init__( + self, + batch_size: int, + max_length: int, + num_beams: int, + device: torch.device, + length_penalty: Optional[float] = 1.0, + do_early_stopping: Optional[bool] = False, + num_beam_hyps_to_keep: Optional[int] = 1, + ): + self.max_length = max_length + self.num_beams = num_beams + self.device = device + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + + self._is_init = False + self._beam_hyps = [ + BeamHypotheses( + num_beams=self.num_beams, + max_length=self.max_length, + length_penalty=self.length_penalty, + early_stopping=self.do_early_stopping, + ) for _ in range(batch_size) + ] + self._done = torch.tensor([False for _ in range(batch_size)], + dtype=torch.bool, + device=self.device) + + # if not isinstance(num_beams, int) or num_beams <= 1: + # raise ValueError( + # ) + + @property + def is_done(self) -> bool: + return self._done.all() + + def process(self, + input_ids: torch.LongTensor, + next_scores: torch.FloatTensor, + next_tokens: torch.LongTensor, + next_indices: torch.LongTensor, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + mems=None) -> Tuple[torch.Tensor]: + cur_len = input_ids.shape[-1] + batch_size = len(self._beam_hyps) + assert batch_size == (input_ids.shape[0] // self.num_beams) + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + device = next_scores.device + next_beam_scores = torch.zeros((batch_size, self.num_beams), + dtype=next_scores.dtype, + device=device) + next_beam_tokens = torch.zeros((batch_size, self.num_beams), + dtype=next_tokens.dtype, + device=device) + next_beam_indices = torch.zeros((batch_size, self.num_beams), + dtype=next_indices.dtype, + device=device) + + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + assert ( + len(beam_hyp) >= self.num_beams + ), 'Batch can only be done if at least {} beams have been generated'.format( + self.num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), 'generated beams >= num_beams -> eos_token_id and pad_token have to be defined' + # pad the batch + next_beam_scores[batch_idx, :] = 0 + next_beam_tokens[batch_idx, :] = pad_token_id + next_beam_indices[batch_idx, :] = 0 + continue + + # next tokens for this sentence + beam_idx = 0 + for beam_token_rank, (next_token, next_score, + next_index) in enumerate( + zip(next_tokens[batch_idx], + next_scores[batch_idx], + next_indices[batch_idx])): + batch_beam_idx = batch_idx * self.num_beams + next_index + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (next_token.item() + in eos_token_id): + # if beam_token does 
not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add( + input_ids[batch_beam_idx].clone(), + next_score.item(), + mems=[mem[[next_index.item()]] + for mem in mems] if mems else None) + else: + # add next predicted token since it is not eos_token + next_beam_scores[batch_idx, beam_idx] = next_score + next_beam_tokens[batch_idx, beam_idx] = next_token + next_beam_indices[batch_idx, beam_idx] = batch_beam_idx + beam_idx += 1 + + # once the beam for next step is full, don't add more tokens to it. + if beam_idx == self.num_beams: + break + + if beam_idx < self.num_beams: + raise ValueError( + f'At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected.' # noqa + ) # noqa + + # Check if we are done so that we can save a pad step if all(done) + self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( + next_scores[batch_idx].max().item(), cur_len) + + return UserDict({ + 'next_beam_scores': next_beam_scores.view(-1), + 'next_beam_tokens': next_beam_tokens.view(-1), + 'next_beam_indices': next_beam_indices.view(-1), + }) + + def finalize(self, + input_ids: torch.LongTensor, + final_beam_scores: torch.FloatTensor, + final_beam_tokens: torch.LongTensor, + final_beam_indices: torch.LongTensor, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + mems=None) -> Tuple[torch.LongTensor, List[torch.Tensor]]: + batch_size = len(self._beam_hyps) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + continue + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(self.num_beams): + batch_beam_idx = batch_idx * self.num_beams + beam_id + final_score = final_beam_scores[batch_beam_idx].item() + final_tokens = input_ids[batch_beam_idx] + beam_hyp.add( + final_tokens, + final_score, + mems=[mem[[batch_beam_idx]] + for mem in mems] if mems else None) + + # select the best hypotheses + sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) + best = [] + + # retrieve best hypotheses + for i, beam_hyp in enumerate(self._beam_hyps): + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) + for j in range(self.num_beam_hyps_to_keep): + best_hyp, mems = sorted_hyps.pop()[1:] + sent_lengths[self.num_beam_hyps_to_keep * i + + j] = len(best_hyp) + best.append((best_hyp, mems)) + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item(), self.max_length) + decoded: torch.LongTensor = input_ids.new( + batch_size * self.num_beam_hyps_to_keep, sent_max_len) + # shorter batches are padded if needed + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, '`pad_token_id` has to be defined' + decoded.fill_(pad_token_id) + + # fill with hypotheses and eos_token_id if the latter fits in + mems = [] + for i, (hypo, mem) in enumerate(best): + decoded[i, :sent_lengths[i]] = hypo + if sent_lengths[i] < sent_max_len: + decoded[i, sent_lengths[i]] = eos_token_id + mems.append(mem) + mems = [ + torch.cat([mem[i] for mem in mems], dim=0) + for i in range(len(mems[0])) + ] if mems and mems[0] else None + return decoded, mems + + +class BeamHypotheses: + + def __init__(self, num_beams: int, max_length: int, length_penalty: float, + early_stopping: bool): 
+ """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp: torch.LongTensor, sum_logprobs: float, mems=None): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / (max(hyp.shape[-1], 1)**self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp, mems)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([ + (s, idx) for idx, (s, _, _) in enumerate(self.beams) + ]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len**self.length_penalty + ret = self.worst_score >= cur_score + return ret + + +class LogitsProcessor(ABC): + """Abstract base class for all logit processors that can be applied during generation.""" + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + """Torch method for processing logits.""" + raise NotImplementedError( + f'{self.__class__} is an abstract class. Only classes inheriting this class can be called.' + ) + + +class LogitsProcessorList(list): + """ + This class can be used to create a list of :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from + list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsProcessor` to the inputs. + """ + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (:obj:`int`): + The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. 
+ """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError( + f'`min_length` has to be a positive integer, but is {min_length}' + ) + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError( + f'`eos_token_id` has to be a positive integer, but is {eos_token_id}' + ) + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + scores[:, self.eos_token_id] = -float('inf') + return scores + + +class NoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq + `__. + + Args: + ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError( + f'`ngram_size` has to be a strictly positive integer, but is {ngram_size}' + ) + self.ngram_size = ngram_size + + def __call__(self, input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = self._calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float('inf') + + return scores + + def _calc_banned_ngram_tokens(self, prev_input_ids: torch.Tensor, + num_hypos: int, + cur_len: int) -> List[Iterable[int]]: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < self.ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] + for i in range(self.ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get( + prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - self.ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, + start_idx:cur_len].tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [ + _get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos) + ] + return banned_tokens diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py new file mode 100644 index 00000000..ea1dfb5a --- /dev/null +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -0,0 +1,469 @@ +# Copyright (c) 2022 Zhipu.AI + +import os +import random +from os import path as osp +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from . 
import mpu +from .arguments import get_args +from .generation_utils import BeamSearchScorer +from .train_utils import get_model +from .utils import load_checkpoint + +__all__ = ['MGLMForTextSummarization'] + + +def setup_args(args): + args.block_lm = True + args.task_mask = True + args.cloze_eval = True + args.num_layers = 24 + args.hidden_size = 1536 + args.num_attention_heads = 16 + args.max_position_embeddings = 1024 + args.tokenizer_type = 'ChineseSPTokenizer' + args.load_pretrained = '' + args.DDP_impl = 'none' + args.model_parallel_size = 1 + args.fp16 = True + args.cache_dir = 'cache' + args.out_seq_length = 200 + args.seq_length = 512 + args.temperature = 0.9 + args.top_k = 2 + args.top_p = 0.8 + args.frequency_penalty = 0.1 + args.presence_penalty = 0.1 + args.mem_length = args.seq_length + args.mem_length - 1 + return args + + +def setup_model(args): + """Setup model and optimizer.""" + + model = get_model(args, model_type='generation') + + if args.load_pretrained is not None: + args.no_load_optim = True + args.load = args.load_pretrained + _ = load_checkpoint(model, None, None, args) + + return model + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def get_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + loss_mask=None, + attention_mask=None, + set_loss_mask=False, + mem_length=None): + # Extract batch size and sequence length. + batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if mem_length: + if attention_mask is None: + attention_mask = torch.ones( + (1, seq_length, seq_length + mem_length), device=data.device) + attention_mask = torch.tril( + torch.triu(attention_mask, 1 - seq_length + mem_length), + mem_length) + else: + if reset_attention_mask: + att_mask_batch = batch_size + else: + att_mask_batch = 1 + if attention_mask is None: + attention_mask = torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device) + attention_mask = torch.tril(attention_mask) + attention_mask = attention_mask.unsqueeze(1) + + # Loss mask. + if loss_mask is None: + loss_mask = torch.ones( + data.size(), dtype=torch.float, device=data.device) + + # Position ids. + position_ids = torch.arange( + seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + if set_loss_mask: + loss_mask[data == eod_token] = 0.0 + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + return attention_mask, loss_mask, position_ids + + +def initialize_distributed(args): + """Initialize torch.distributed.""" + + # Manually set the device ids. 
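    # Each rank binds to one local GPU: the global rank modulo the number of
    # visible devices by default, unless an explicit local_rank was provided
    # by the launcher, which takes precedence below.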
+ device = args.rank % torch.cuda.device_count() + if args.local_rank is not None: + device = args.local_rank + torch.cuda.set_device(device) + # Call the init process + init_method = 'tcp://' + args.master_ip = os.getenv('MASTER_ADDR', 'localhost') + args.master_port = os.getenv('MASTER_PORT', '6000') + init_method += args.master_ip + ':' + args.master_port + torch.distributed.init_process_group( + backend=args.distributed_backend, + world_size=args.world_size, + rank=args.rank, + init_method=init_method) + + # Set the model-parallel / data-parallel communicators. + mpu.initialize_model_parallel(args.model_parallel_size) + + # Optional DeepSpeed Activation Checkpointing Features + # + if hasattr( + args, 'deepspeed' + ) and args.deepspeed and args.deepspeed_activation_checkpointing: + set_deepspeed_activation_checkpointing(args) + + +def get_batch(context_tokens, device, args): + tokens = context_tokens + tokens = tokens.view(args.batch_size, -1).contiguous() + tokens = tokens.to(device) + + # Get the masks and postition ids. + if args.block_lm: + attention_mask = torch.tensor([tokens.size(1)], + device=device, + dtype=torch.long) + position_ids = torch.arange( + tokens.size(1), device=device, dtype=torch.long) + if not args.no_block_position: + block_position_ids = torch.zeros( + tokens.size(1), device=device, dtype=torch.long) + position_ids = torch.stack((position_ids, block_position_ids), + dim=0) + position_ids = position_ids.unsqueeze(0) + else: + attention_mask, loss_mask, position_ids = get_masks_and_position_ids( + tokens, + args.eod_token, + reset_position_ids=False, + reset_attention_mask=False, + set_loss_mask=False, + mem_length=args.mem_length) + + return tokens, attention_mask, position_ids + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + # This function has been mostly taken from huggingface conversational ai code at + # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 + + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # convert to 1D + logits = logits.view(logits.size()[1]).contiguous() + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + # going back to 2D + logits = logits.view(1, -1).contiguous() + + return logits + + +def sample_sequence(model, + tokenizer, + context_tokens, + context_length, + args, + device, + mems=None, + end_tokens=None): + if not args.block_lm: + context_tokens, attention_mask, position_ids = get_batch( + context_tokens, device, args) + tokens = torch.empty((args.num_beams, 0), + device=context_tokens.device, + dtype=torch.long) + else: + tokens = context_tokens.new_full((1, 1), + tokenizer.get_command('sop').Id) + counter = 0 + if mems is None: + mems = [] + if end_tokens is None: + end_tokens = [args.eod_token] + + last_beam_num = 1 + 
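    # The loop below decodes autoregressively, one token per iteration: it
    # reuses the cached memories (`mems`), subtracts frequency and presence
    # penalties from the logits, filters them with top_k_logits() (top-k /
    # nucleus sampling), samples with torch.multinomial, and stops once an
    # end token is drawn or out_seq_length tokens have been generated.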
output_tokens_list = [] + generated_tokens_list = [] + + while counter < args.out_seq_length: + if counter == 0 and not args.block_lm: + next_token_logits, *mems = model(context_tokens, position_ids, + attention_mask, *mems) + else: + if args.block_lm: + if args.no_block_position: + position_ids = context_tokens.new_full( + (last_beam_num, 1), context_length + counter) + else: + position_ids = context_tokens.new_ones(last_beam_num, 2, 1) + position_ids[:, 0] = context_length + position_ids[:, 1] = counter + 1 + attention_mask = context_tokens.new_zeros( + [1], device=context_tokens.device, dtype=torch.long) + else: + position_ids = context_tokens.new_ones((last_beam_num, 1)) * ( + context_length + counter - 1) + attention_mask = context_tokens.new_ones( + last_beam_num, + 1, + 1, + args.mem_length + 1, + device=context_tokens.device, + dtype=torch.float) + last_token = tokens[:, -1:] + next_token_logits, *mems = model(last_token, position_ids, + attention_mask, *mems) + next_token_logits = next_token_logits[:, -1] + + next_token_logits /= args.temperature + frequency_count = torch.zeros(next_token_logits.shape) + for tk in output_tokens_list: + frequency_count[0][tk] += 1 + + next_token_logits -= (args.frequency_penalty + * frequency_count).to(device) + next_token_logits -= ( + args.presence_penalty * # noqa + (frequency_count > 0)).to(device) + + next_token_logits = top_k_logits( + next_token_logits, top_k=args.top_k, top_p=args.top_p) + log_probs = F.softmax(next_token_logits, dim=-1) + prev = torch.multinomial(log_probs, num_samples=1)[0] + is_end = prev.item() in end_tokens + if is_end: + break + decode_tokens = tokenizer.DecodeIds([prev.item()]) # noqa + generated_tokens_list.append(prev.item()) + prev = prev.view(1, 1) + tokens = prev if tokens is None else torch.cat((tokens, prev), dim=1) + counter += 1 + output_tokens_list = tokens.view(-1).contiguous() + return torch.cat((context_tokens, tokens), dim=1), mems + + +def read_context(tokenizer, args, context): + terminate_runs, skip_run = 0, 0 # noqa + if mpu.get_model_parallel_rank() == 0: + while True: + # raw_text = input("\nContext prompt (stop to exit) >>> ") + raw_text = context + if not raw_text: + print('Prompt should not be empty!') + break + # if raw_text == "stop": + # terminate_runs = 1 + # break + generation_mask = '[gMASK]' if args.task_mask else '[MASK]' + if args.block_lm and 'MASK]' not in raw_text: + raw_text += ' ' + generation_mask + # output.write(raw_text) + context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization + if args.block_lm: + context_tokens = [tokenizer.get_command('ENC').Id + ] + context_tokens + if not raw_text.endswith('[gMASK]'): + context_tokens = context_tokens + [ + tokenizer.get_command('eos').Id + ] + context_length = len(context_tokens) + + if context_length >= args.seq_length: + print('\nContext length', context_length, + '\nPlease give smaller context than the window length!') + break + break + else: + context_length = 0 + + terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) + torch.distributed.broadcast( + terminate_runs_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + terminate_runs = terminate_runs_tensor[0].item() + + if terminate_runs == 1: + return terminate_runs, None, None, None + + context_length_tensor = torch.cuda.LongTensor([context_length]) + + torch.distributed.broadcast( + context_length_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + context_length = context_length_tensor[0].item() 
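    # Only model-parallel rank 0 tokenized the prompt; the other ranks
    # allocate a placeholder of the broadcast length below and then receive
    # the actual token ids from the model-parallel source rank.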
+ if mpu.get_model_parallel_rank() == 0: + context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + else: + context_tokens_tensor = torch.cuda.LongTensor([0] * context_length) + torch.distributed.broadcast( + context_tokens_tensor, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group()) + if mpu.get_model_parallel_rank() != 0: + raw_text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) + return terminate_runs, raw_text, context_tokens_tensor, context_length + + +@MODELS.register_module(Tasks.text_summarization, module_name=Models.mglm) +class MGLMForTextSummarization(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text summarization model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + from .configure_data import prepare_tokenizer + # Disable CuDNN. + torch.backends.cudnn.enabled = False + # Arguments. + self.args = setup_args(get_args()) + self.args.load_pretrained = model_dir + # Pytorch distributed. + try: + initialize_distributed(self.args) + except (RuntimeError): + print('group process initialized twice') + # Random seeds for reproducability. + set_random_seed(self.args.seed) + # setting default batch size to 1 + self.args.batch_size = 1 + self.args.tokenizer_path = model_dir + self.tokenizer = prepare_tokenizer(self.args) + self.model = setup_model(self.args) + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + + def forward(self, input: Dict[str, str]) -> Dict[str, str]: + pass + + def generate(self, input: Dict[str, str]) -> Dict[str, str]: + model = self.model + tokenizer = self.tokenizer + args = self.args + device = torch.cuda.current_device() + model.eval() + + context = input['text'] + self.cfg.model.prompt + with torch.no_grad(): + terminate_runs, raw_text, context_tokens_tensor, context_length = read_context( + tokenizer, args, context) + mems = [] + tokens, attention_mask, position_ids = get_batch( + context_tokens_tensor, device, args) + mask_tokens = ['MASK', 'sMASK', 'gMASK' + ] if args.task_mask else ['MASK'] + mask_tokens = [ + tokenizer.get_command(token).Id for token in mask_tokens + ] + end_tokens = [tokenizer.get_command('eop').Id, args.eod_token] + + mask_positions = [] + for token in mask_tokens: + mask_positions += (context_tokens_tensor == token).nonzero( + as_tuple=True)[0].tolist() + mask_positions.sort() + if args.no_block_position: + for mask_position in mask_positions: + position_ids[0, mask_position + 1:] += args.out_seq_length + _, *mems = model(tokens, position_ids, attention_mask, *mems) + for mask_position in mask_positions: + if args.no_block_position: + position = position_ids[0, mask_position].item() + else: + position = mask_position + tokens, mems, = sample_sequence( + model, + tokenizer, + tokens, + position, + args, + device, + mems=mems, + end_tokens=end_tokens) + output_tokens_list = tokens.view(-1).contiguous() + trim_decode_tokens = tokenizer.DecodeIds( + output_tokens_list.tolist()) + res = trim_decode_tokens.split('<|startofpiece|>')[-1] + print(res) + return {OutputKeys.TEXT: res} diff --git a/modelscope/models/nlp/mglm/model/__init__.py b/modelscope/models/nlp/mglm/model/__init__.py new file mode 100755 index 00000000..84c55ae3 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .distributed import (DistributedDataParallel, + PyTorchDistributedDataParallel) +from .downstream import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, + GLMForSequenceClassification, GLMForSingleTokenCloze) +from .modeling_glm import (GLMModel, + glm_get_params_for_weight_decay_optimization) diff --git a/modelscope/models/nlp/mglm/model/distributed.py b/modelscope/models/nlp/mglm/model/distributed.py new file mode 100755 index 00000000..a3c84e9f --- /dev/null +++ b/modelscope/models/nlp/mglm/model/distributed.py @@ -0,0 +1,127 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.modules import Module +from torch.nn.parallel.distributed import DistributedDataParallel as DDP + +from modelscope.models.nlp.mglm import mpu + + +class PyTorchDistributedDataParallel(DDP): + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + self.data_parallel_group = mpu.get_data_parallel_group() + src_rank = mpu.get_model_parallel_rank() + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, src_rank, group=self.data_parallel_group) + + def allreduce_params(reduce_after=True, + no_scale=False, + fp32_allreduce=False): + if (self.needs_reduction): + self.needs_reduction = False + buckets = {} + for name, param in self.module.named_parameters(): + if param.requires_grad and param.grad is not None: + tp = (param.data.type()) + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print( + 'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.' 
# noqa + ) + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + dist.all_reduce(coalesced, group=self.data_parallel_group) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size( + group=self.data_parallel_group) + for buf, synced in zip( + grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.module.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.module.named_parameters(prefix=prefix, recurse=recurse) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. 
+ if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' diff --git a/modelscope/models/nlp/mglm/model/downstream.py b/modelscope/models/nlp/mglm/model/downstream.py new file mode 100644 index 00000000..61b1e807 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/downstream.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 Zhipu.AI +"""Multiple choice model.""" + +import torch +import torch.nn + +from .modeling_glm import GLMModel + + +class GLMForMultiTokenCloze(torch.nn.Module): + + def __init__(self, + language_model: GLMModel, + take_softmax=True, + length_penalty=0.0): + super(GLMForMultiTokenCloze, self).__init__() + self.model = language_model + self.take_softmax = take_softmax + self.length_penalty = length_penalty + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # [h.remove() for h in self.hook_handles] + sd = self.model.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.model.named_parameters(prefix=prefix, recurse=recurse) + + def forward(self, + input_ids, + position_ids, + attention_mask, + target_ids=None, + logit_mask=None, + prompt_pos=None): + if target_ids is None: + return self.model(input_ids, position_ids, attention_mask) + num_choices = None + if len(input_ids.shape) == 3: + batch_size, num_choices = input_ids.shape[:2] + input_ids = input_ids.reshape(-1, input_ids.size(-1)) + attention_mask = attention_mask.reshape(-1, + *attention_mask.size()[2:]) + position_ids = position_ids.reshape(-1, *position_ids.size()[2:]) + target_ids = target_ids.reshape(-1, target_ids.size(-1)) + logit_mask = logit_mask.reshape(-1, logit_mask.size(-1)) + if prompt_pos is not None: + prompt_pos = prompt_pos.reshape(-1, prompt_pos.size(-1)) + outputs, *mems = self.model( + input_ids, position_ids, attention_mask, prompt_pos=prompt_pos) + if self.take_softmax: + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + # select the target logits + batch_ids = torch.arange( + target_ids.size(0), dtype=torch.long, device=target_ids.device) + batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids) + seq_ids = torch.arange( + target_ids.size(-1), dtype=torch.long, device=target_ids.device) + seq_ids = seq_ids.unsqueeze(0).expand_as(target_ids) + logits = outputs[batch_ids, seq_ids, target_ids] + logits = (logits * logit_mask).sum(dim=1) + if self.length_penalty > 0.0: + logits = logits / logit_mask.sum(dim=1)**self.length_penalty + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) + + +class GLMForMultiTokenClozeFast(torch.nn.Module): + + def __init__(self, language_model, take_softmax=True, length_penalty=0.0): + super(GLMForMultiTokenClozeFast, self).__init__() + self.model = language_model + self.take_softmax = take_softmax + self.length_penalty = length_penalty + + def forward(self, input_ids, position_ids, attention_mask, dec_input_ids, + dec_position_ids, dec_attention_mask, dec_target_ids, + dec_logit_mask): + # encoder + outputs, *mems = self.model( + input_ids, + position_ids, + attention_mask, + return_memory=True, + detach_memory=False) + batch_size, num_choices, max_dec_len = dec_input_ids.size() + max_enc_len = input_ids.size(-1) + + enc_mems = [] + for hidden in mems: + hidden = hidden.unsqueeze(1).expand(-1, 
num_choices, -1, + -1).reshape( + batch_size * num_choices, + *hidden.size()[1:]) + enc_mems.append(hidden) + + def build_dec_mask_matrix(seq_length, sep, memory_length=0): + m = enc_mems[0].new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + + # sep = dec_attention_mask + ids = torch.arange( + memory_length, device=sep.device, dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) # batch * mem + mask = mask.unsqueeze(1).float().expand(-1, seq_length, -1) + + m = m.expand(batch_size * num_choices, -1, -1) + m = torch.cat((mask, m), dim=2) + m = m.unsqueeze(1) + return m + + dec_input_ids = dec_input_ids.reshape(-1, max_dec_len) + dec_position_ids = dec_position_ids.reshape( + -1, + *dec_position_ids.size()[2:]) + # dec_attention_mask = dec_attention_mask.reshape(-1, *dec_attention_mask.size()[2:]).unsqueeze(1) + dec_attention_mask = build_dec_mask_matrix( + max_dec_len, dec_attention_mask.reshape(-1), max_enc_len) + dec_target_ids = dec_target_ids.reshape(-1, dec_target_ids.size(-1)) + dec_logit_mask = dec_logit_mask.reshape(-1, dec_logit_mask.size(-1)) + + outputs, *mems = self.model(dec_input_ids, dec_position_ids, + dec_attention_mask, *enc_mems) + if self.take_softmax: + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + + batch_ids = torch.arange( + dec_target_ids.size(0), + dtype=torch.long, + device=dec_target_ids.device) + batch_ids = batch_ids.unsqueeze(1).expand_as(dec_target_ids) + seq_ids = torch.arange( + dec_target_ids.size(-1), + dtype=torch.long, + device=dec_target_ids.device) + seq_ids = seq_ids.unsqueeze(0).expand_as(dec_target_ids) + logits = outputs[batch_ids, seq_ids, dec_target_ids] + logits = (logits * dec_logit_mask).sum(dim=1) + if self.length_penalty > 0.0: + logits = logits / dec_logit_mask.sum(dim=1)**self.length_penalty + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) + + +class GLMForSingleTokenCloze(torch.nn.Module): + + def __init__(self, language_model, take_softmax=False): + super().__init__() + self.model = language_model + self.take_softmax = take_softmax + + def state_dict(self, destination=None, prefix='', keep_vars=False): + # [h.remove() for h in self.hook_handles] + sd = self.model.state_dict(destination, prefix, keep_vars) + return sd + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) + + def named_parameters(self, prefix: str = '', recurse: bool = True): + return self.model.named_parameters(prefix=prefix, recurse=recurse) + + def forward(self, + input_ids, + position_ids, + attention_mask, + target_ids=None, + logit_mask=None, + prompt_pos=None): + if target_ids is None: + return self.model(input_ids, position_ids, attention_mask) + assert len(input_ids.shape) == 2 + outputs, *mems = self.model( + input_ids, position_ids, attention_mask, prompt_pos=prompt_pos) + batch_ids = torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device) + target_logits = outputs[batch_ids, attention_mask] + if self.take_softmax: + target_prob = torch.nn.functional.log_softmax( + target_logits, dim=-1) + else: + target_prob = target_logits + batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids) + output = target_prob[batch_ids, target_ids] + + return (output, target_logits, *mems) + + +class GLMForSequenceClassification(torch.nn.Module): + + def __init__(self, + language_model, + hidden_size, + hidden_dropout, + pool_token, + num_class=1): + super().__init__() + self.pool_token = pool_token + 
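        # `pool_token` selects which hidden state feeds the classification
        # head built below: 'cls' pools position 0, while 'start' and 'pad'
        # index positions derived from `attention_mask` in forward(). The
        # head is Linear -> tanh -> dropout -> Linear(num_class).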
self.model = language_model + self.num_class = num_class + # Multi-choice head. + self.pool_layer = torch.nn.Linear(hidden_size, hidden_size) + self.multichoice_dropout = torch.nn.Dropout(hidden_dropout) + self.multichoice_head = torch.nn.Linear(hidden_size, num_class) + + def forward(self, input_ids, position_ids, attention_mask): + num_choices = None + if len(input_ids.shape) == 3: + assert self.num_class == 1 + batch_size, num_choices = input_ids.shape[:2] + input_ids = input_ids.reshape(-1, input_ids.size(-1)) + attention_mask = attention_mask.reshape(-1, + *attention_mask.size()[2:]) + position_ids = position_ids.reshape(-1, *position_ids.size()[2:]) + outputs, *mems = self.model(input_ids, position_ids, attention_mask) + if self.pool_token == 'start': + output = outputs[torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device), attention_mask] + elif self.pool_token == 'pad': + output = outputs[torch.arange( + outputs.size(0), + dtype=attention_mask.dtype, + device=attention_mask.device), attention_mask - 1] + elif self.pool_token == 'cls': + output = outputs[:, 0] + else: + raise NotImplementedError + output = torch.tanh(self.pool_layer(output)) + multichoice_output = self.multichoice_dropout(output) + logits = self.multichoice_head(multichoice_output) + if num_choices is not None: + logits = logits.view(-1, num_choices) + return (logits, *mems) diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py new file mode 100644 index 00000000..965f82a7 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/modeling_bert.py @@ -0,0 +1,1576 @@ +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import copy +import logging +import math +import os +import shutil +import tarfile +import tempfile + +import json +import mpu +import torch +import torch.nn.functional as F +from data_utils.file_utils import cached_path +from torch import nn +from torch.nn import CrossEntropyLoss + +# from torch.utils.checkpoint import checkpoint + + +def normal_init_method(mean, std): + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def scaled_init_method(mean, std, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = std / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=mean, std=std) + + return init_ + + +def bert_extended_attention_mask(attention_mask): + # We create a 3D attention mask from a 2D tensor mask. 
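    # For a padding mask of shape [b, s], the product of the two views built
    # below ([b, 1, s] * [b, s, 1]) yields a [b, s, s] matrix that is 1 only
    # where both the query and the key position are real tokens; the final
    # unsqueeze adds a singleton head dimension for broadcasting.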
+ # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + + return extended_attention_mask + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': + '/root/data/bert-base-uncased.tar.gz', + 'bert-large-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz', + 'bert-base-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz', + 'bert-large-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz', + 'bert-base-multilingual-uncased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz', + 'bert-base-multilingual-cased': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz', + 'bert-base-chinese': + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz', +} +CONFIG_NAME = 'bert_config.json' +WEIGHTS_NAME = 'pytorch_model.bin' +TF_WEIGHTS_NAME = 'model.ckpt' + + +def load_tf_weights_in_bert(model, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print( + 'Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print('Converting TensorFlow checkpoint from {}'.format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print('Loading TF weight {} with shape {}'.format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ['adam_v', 'adam_m'] for n in name): + print('Skipping {}'.format('/'.join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) # noqa + else: + l = [m_name] # noqa + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print('Initialize PyTorch weight {}'.format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. 
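        gelu(x) = x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2))), where Phi is
        the CDF of the standard normal distribution; this is the exact
        (erf-based) form implemented below.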
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + deep_init=False, + fp32_layernorm=False, + fp32_embedding=False, + fp32_tokentypes=False, + layernorm_epsilon=1e-12): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
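        A minimal sketch (30522 is merely an illustrative vocabulary size):

            config = BertConfig(vocab_size_or_config_json_file=30522)
            config_dict = config.to_dict()
            same_config = BertConfig.from_dict(config_dict)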
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open( + vocab_size_or_config_json_file, 'r', + encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.deep_init = deep_init + self.fp32_layernorm = fp32_layernorm + self.fp32_embedding = fp32_embedding + self.layernorm_epsilon = layernorm_epsilon + self.fp32_tokentypes = fp32_tokentypes + else: + raise ValueError( + 'First argument must be either a vocabulary size (int)' + 'or the path to a pretrained model config file (str)') + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, 'r', encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + print( + 'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.' + ) + + class BertLayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + # self.word_embeddings = mpu.VocabParallelEmbedding( + # config.vocab_size, config.hidden_size, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float( + ) + token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = 
self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + previous_type = attention_probs.type() # noqa + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( # noqa + mean=0.0, std=config.initializer_range) + self.dense = nn.Linear( + config.hidden_size, config.hidden_size, bias=True) + # self.dense = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=config.hidden_size, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=init_method) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + # self.self = mpu.BertParallelSelfAttention( + # hidden_size=config.hidden_size, + # num_attention_heads=config.num_attention_heads, + # dropout_prob=config.attention_probs_dropout_prob, + # output_parallel=True, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear( + config.hidden_size, config.intermediate_size, bias=True) + # self.dense = mpu.ColumnParallelLinear( + # 
input_size=config.hidden_size, + # output_size=config.intermediate_size, + # bias=True, + # gather_output=False, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method( + mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method( # noqa + mean=0.0, std=config.initializer_range) + self.dense = nn.Linear( + config.intermediate_size, config.hidden_size, bias=True) + # self.dense = mpu.RowParallelLinear( + # input_size=config.intermediate_size, + # output_size=config.hidden_size, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=init_method) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super(BertEncoder, self).__init__() + # layer = BertLayer(config) + # self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + # def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + # all_encoder_layers = [] + # for layer_module in self.layer: + # hidden_states = layer_module(hidden_states, attention_mask) + # if output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # if not output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # return all_encoder_layers + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False): + all_encoder_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + + return custom_forward + + if checkpoint_activations: + l = 0 # noqa + num_layers = len(self.layer) + chunk_length = 1 # math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = mpu.checkpoint( + custom(l, l + chunk_length), 
hidden_states, + attention_mask * 1) + l += chunk_length # noqa + # decoder layers + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
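+        # Shape note: `bert_model_embedding_weights` is the word-embedding matrix
+        # of shape [vocab_size, hidden_size], so the decoder below maps hidden
+        # states [batch_size, seq_len, hidden_size] back to vocabulary logits
+        # [batch_size, seq_len, vocab_size].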
+ self.decoder = nn.Linear( + bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + # self.decoder_weight = bert_model_embedding_weights + # self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + # self.bias.model_parallel = True + self.fp32_embedding = config.fp32_embedding + self.fp32_layernorm = config.fp32_layernorm + + def convert_to_type(tensor): + if self.fp32_embedding: + return tensor.half() + else: + return tensor + + self.type_converter = convert_to_type + self.converted = False + + def forward(self, hidden_states): + if not self.converted: + self.converted = True + if self.fp32_embedding: + self.transform.half() + if self.fp32_layernorm: + self.transform.LayerNorm.float() + hidden_states = self.transform(self.type_converter(hidden_states)) + hidden_states = self.decoder(hidden_states) + self.bias + # hidden_states = mpu.copy_to_model_parallel_region(hidden_states) + # hidden_states = F.linear(self.type_converter(hidden_states), + # self.type_converter(self.decoder_weight), + # self.type_converter(self.bias)) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + for p in self.seq_relationship.parameters(): + if p is None: + continue + pooled_output = pooled_output.type_as(p) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + 'Parameter config in `{}(config)` should be an instance of class `BertConfig`. ' + 'To create a model from a Google pretrained model use ' + '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format( + self.__class__.__name__, self.__class__.__name__)) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. 
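+
+        Linear and Embedding weights are drawn from a normal distribution with
+        std `config.initializer_range`; LayerNorm is reset to weight 1 and bias 0.
+        The method is meant to be applied recursively, e.g. via
+        `self.apply(self.init_bert_weights)` as done in the model constructors below.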
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, + pretrained_model_name, + state_dict=None, + cache_dir=None, + fp32_layernorm=False, + fp32_embedding=False, + layernorm_epsilon=1e-12, + fp32_tokentypes=False, + *inputs, + **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ # noqa + if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] + else: + archive_file = pretrained_model_name + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path( + archive_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + 'associated to this path or url.'.format( + pretrained_model_name, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + if resolved_archive_file == archive_file: + logger.info('loading archive file {}'.format(archive_file)) + else: + logger.info('loading archive file {} from cache at {}'.format( + archive_file, resolved_archive_file)) + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info('extracting archive file {} to temp dir {}'.format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = BertConfig.from_json_file(config_file) + config.fp32_layernorm = fp32_layernorm + config.fp32_embedding = fp32_embedding + config.layernorm_epsilon = layernorm_epsilon + config.fp32_tokentypes = fp32_tokentypes + logger.info('Model config {}'.format(config)) + # Instantiate model. 
+ model = cls(config, *inputs, **kwargs) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, + True, missing_keys, unexpected_keys, + error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + print('Weights of {} not initialized from pretrained model: {}'. + format(model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + print('Weights from pretrained model not used in {}: {}'.format( + model.__class__.__name__, unexpected_keys)) + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class BertModel(PreTrainedBertModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLF`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ # noqa + + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + output_all_encoded_layers=True, + checkpoint_activations=False): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder( + embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class BertForPreTraining(PreTrainedBertModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. 
+ + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
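+
+    For the training path, a sketch reusing the tensors from the example below
+    (label values are dummies):
+
+    ```python
+    # -1 marks positions ignored by the masked LM loss
+    masked_lm_labels = torch.LongTensor([[-1, 17, -1], [-1, -1, 42]])
+    next_sentence_label = torch.LongTensor([0, 1])
+    total_loss = model(input_ids, token_type_ids, input_mask,
+                       masked_lm_labels=masked_lm_labels,
+                       next_sentence_label=next_sentence_label)
+    ```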
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads( + config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + masked_lm_labels=None, + next_sentence_label=None, + checkpoint_activations=False): + sequence_output, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size).float(), + masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2).float(), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + else: + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(PreTrainedBertModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
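+
+    For the training path, a sketch reusing the tensors from the example below
+    (dummy labels; -1 positions are ignored by the loss):
+
+    ```python
+    masked_lm_labels = torch.LongTensor([[-1, 17, -1], [-1, -1, 42]])
+    masked_lm_loss = model(input_ids, token_type_ids, input_mask,
+                           masked_lm_labels=masked_lm_labels)
+    ```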
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config, + self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + masked_lm_labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + return masked_lm_loss + else: + return prediction_scores + + +class BertForNextSentencePrediction(PreTrainedBertModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. 
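+
+    With `next_sentence_label` supplied (a sketch reusing the tensors from the
+    example below; 0 = continuation, 1 = random sentence), the same call returns
+    the classification loss instead of the logits:
+
+    ```python
+    next_sentence_label = torch.LongTensor([0, 1])
+    next_sentence_loss = model(input_ids, token_type_ids, input_mask,
+                               next_sentence_label=next_sentence_label)
+    ```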
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + next_sentence_label=None, + checkpoint_activations=False): + _, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(PreTrainedBertModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
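+
+    With `labels` supplied (a sketch reusing the tensors from the example below;
+    one class index per sequence), the same call returns the CrossEntropy loss
+    instead of the logits:
+
+    ```python
+    labels = torch.LongTensor([1, 0])  # shape [batch_size], values in [0, num_labels)
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    ```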
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + _, pooled_output = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForMultipleChoice(PreTrainedBertModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
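+
+    With `labels` supplied (a sketch reusing the tensors from the example below;
+    one index per example selecting the correct choice), the same call returns
+    the CrossEntropy loss instead of the reshaped logits:
+
+    ```python
+    labels = torch.LongTensor([0, 1])  # shape [batch_size], values in [0, num_choices)
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    ```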
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMultipleChoice, self).__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + batch_size, num_choices = input_ids.shape[:2] + flat_input_ids = input_ids.reshape(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.reshape(-1, + token_type_ids.size(-1)) + flat_attention_mask = attention_mask.reshape(-1, + attention_mask.size(-1)) + _, pooled_output = self.bert( + flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape(-1, num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + else: + return reshaped_logits + + +class BertForTokenClassification(PreTrainedBertModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
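+
+    With per-token `labels` supplied (a sketch reusing the tensors from the
+    example below; one label per token), the same call returns the CrossEntropy
+    loss instead of the logits:
+
+    ```python
+    labels = torch.LongTensor([[0, 1, 0], [1, 0, 0]])  # [batch_size, sequence_length]
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    ```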
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + # self.classifier = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=num_labels, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + with mpu.get_cuda_rng_tracker().fork(): + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForQuestionAnswering(PreTrainedBertModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. 
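+
+    With `start_positions` and `end_positions` supplied (a sketch reusing the
+    tensors from the example further below; dummy span positions), the call
+    returns the averaged start/end CrossEntropy loss:
+
+    ```python
+    start_positions = torch.LongTensor([1, 0])  # shape [batch_size]
+    end_positions = torch.LongTensor([2, 1])
+    total_loss = model(input_ids, token_type_ids, input_mask,
+                       start_positions=start_positions,
+                       end_positions=end_positions)
+    ```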
+ + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + # self.qa_outputs = mpu.RowParallelLinear( + # input_size=config.hidden_size, + # output_size=2, + # bias=True, + # input_is_parallel=True, + # stride=1, + # init_method=normal_init_method(mean=0.0, + # std=config.initializer_range)) + self.apply(self.init_bert_weights) + + def forward(self, + input_ids, + token_type_ids=None, + attention_mask=None, + start_positions=None, + end_positions=None, + checkpoint_activations=False): + sequence_output, _ = self.bert( + input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits diff --git a/modelscope/models/nlp/mglm/model/modeling_glm.py b/modelscope/models/nlp/mglm/model/modeling_glm.py new file mode 100644 index 00000000..80f61cef --- /dev/null +++ b/modelscope/models/nlp/mglm/model/modeling_glm.py @@ -0,0 +1,245 @@ +# Modified by Zhipu.AI +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPT-2 model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.nlp.mglm import mpu +from modelscope.models.nlp.mglm.model.prompt import PromptSpell +from modelscope.models.nlp.mglm.utils import print_rank_0 + + +def init_method_normal(std=0.02): + """Init method based on normal distribution. + + This is only used for embeddings. The transformer has its + own initializer. + """ + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GLMModel(torch.nn.Module): + """GLM Language model. + + The output of the forward method are the logits (parallel or + serial depending on the `parallel_output` flag. + """ + + def __init__( + self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + max_sequence_length, + max_memory_length, + checkpoint_activations, + checkpoint_num_layers=1, + parallel_output=True, + relative_encoding=False, + block_position_encoding=False, + output_predict=True, + spell_length=None, + spell_func='lstm', + attention_scale=1.0, + ): + + super(GLMModel, self).__init__() + + self.parallel_output = parallel_output + self.output_predict = output_predict + self.hidden_size = hidden_size + + init_method = init_method_normal(std=0.02) + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init_method) + + # Transformer + self.transformer = mpu.GPT2ParallelTransformer( + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers, + attention_scale=attention_scale, + relative_encoding=relative_encoding, + block_position_encoding=block_position_encoding) + if spell_length is not None: + self.prompt_spell = PromptSpell(spell_length, self.hidden_size, + spell_func) + + def freeze_transformer(self, tune_prefix_layers=None): + log_str = 'Freeze transformer' + self.word_embeddings.requires_grad_(False) + self.transformer.requires_grad_(False) + if tune_prefix_layers is not None: + log_str += f' tune {tune_prefix_layers} prefix layers' + for i in range(tune_prefix_layers): + self.transformer.layers[i].requires_grad_(True) + print_rank_0(log_str) + + def forward(self, + input_ids, + position_ids, + attention_mask, + *mems, + return_memory=False, + detach_memory=True, + prompt_pos=None): + # Embeddings. + batch_size = input_ids.size(0) + words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + if prompt_pos is not None: + embeddings = embeddings.clone() + prompt_embeds = self.prompt_spell() + batch_index = torch.arange( + batch_size, device=input_ids.device).unsqueeze(1) + embeddings[batch_index, prompt_pos] = prompt_embeds + # Transformer. 
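+        # Rough usage sketch (illustrative; the argument values are assumptions
+        # and mpu model parallelism must already be initialized):
+        #   model = GLMModel(num_layers=24, vocab_size=50048, hidden_size=1024,
+        #                    num_attention_heads=16, embedding_dropout_prob=0.1,
+        #                    attention_dropout_prob=0.1, output_dropout_prob=0.1,
+        #                    max_sequence_length=512, max_memory_length=0,
+        #                    checkpoint_activations=False)
+        #   logits, *mems = model(input_ids, position_ids, attention_mask)
+        # The call below returns the final hidden states plus the cached memories.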
+ transformer_output = self.transformer( + embeddings, + position_ids, + attention_mask, + mems, + return_memory=return_memory, + detach_memory=detach_memory) + logits, hidden_layers = transformer_output + outputs = hidden_layers + + if self.output_predict: + # Parallel logits. + logits_parallel = mpu.copy_to_model_parallel_region(logits) + logits_parallel = F.linear(logits_parallel, + self.word_embeddings.weight) + + if self.parallel_output: + return (logits_parallel, *outputs) + + return (mpu.gather_from_model_parallel_region(logits_parallel), + *outputs) + else: + return (logits, *outputs) + + +class EncoderDecoder(torch.nn.Module): + """Seq2Seq Transformer Model + The output of the forward method are the logits (parallel or serial depending on the `parallel_output` flag). + """ + + def __init__(self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + max_sequence_length, + max_memory_length, + checkpoint_activations, + checkpoint_num_layers=1, + parallel_output=True, + output_predict=True): + super(EncoderDecoder, self).__init__() + + self.parallel_output = parallel_output + self.output_predict = output_predict + + init_method = init_method_normal(std=0.02) + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init_method) + + # Transformer + self.encoder = mpu.GPT2ParallelTransformer( + num_layers, hidden_size, num_attention_heads, max_sequence_length, + max_memory_length, embedding_dropout_prob, attention_dropout_prob, + output_dropout_prob, checkpoint_activations, checkpoint_num_layers) + self.decoder = mpu.GPT2ParallelTransformer( + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers, + use_decoder_layer=True) + + def forward(self, source_ids, target_ids, source_position_ids, + target_position_ids, source_mask, target_mask): + # Embeddings. + source_embeddings = self.word_embeddings(source_ids) + target_embeddings = self.word_embeddings(target_ids) + + # Transformer. + encoder_output, _ = self.encoder(source_embeddings, + source_position_ids, source_mask) + decoder_output, _ = self.decoder(target_embeddings, + target_position_ids, target_mask) + if self.output_predict: + # Parallel logits. 
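+            # Logits are obtained by projecting the decoder output onto the
+            # shared word-embedding matrix (weight tying), i.e. roughly
+            #   logits = decoder_output @ word_embeddings.weight.T
+            # computed per model-parallel rank for its vocabulary shard and
+            # optionally gathered across ranks below.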
+ output_parallel = mpu.copy_to_model_parallel_region(decoder_output) + logits_parallel = F.linear(output_parallel, + self.word_embeddings.weight) + + if self.parallel_output: + return (logits_parallel, ) + + return (mpu.gather_from_model_parallel_region(logits_parallel), ) + else: + return (decoder_output, ) + + +def glm_get_params_for_weight_decay_optimization(module): + weight_decay_params = {'params': []} + no_weight_decay_params = {'params': [], 'weight_decay': 0.0} + for module_ in module.modules(): + if isinstance(module_, (mpu.LayerNorm, torch.nn.LayerNorm)): + no_weight_decay_params['params'].extend([ + p for p in list(module_._parameters.values()) + if p is not None and p.requires_grad + ]) + else: + weight_decay_params['params'].extend([ + p for n, p in list(module_._parameters.items()) + if p is not None and p.requires_grad and n != 'bias' + ]) + no_weight_decay_params['params'].extend([ + p for n, p in list(module_._parameters.items()) + if p is not None and p.requires_grad and n == 'bias' + ]) + + return weight_decay_params, no_weight_decay_params diff --git a/modelscope/models/nlp/mglm/model/prompt.py b/modelscope/models/nlp/mglm/model/prompt.py new file mode 100644 index 00000000..a29ceda0 --- /dev/null +++ b/modelscope/models/nlp/mglm/model/prompt.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 Zhipu.AI + +import random + +import torch + + +class PromptSpell(torch.nn.Module): + + def __init__(self, spell_length, hidden_size, spell_func): + super(PromptSpell, self).__init__() + self.spell_length = spell_length + self.hidden_size = hidden_size + self.spell_embeddings = torch.nn.Embedding(self.spell_length, + self.hidden_size) + self.spell_func = spell_func + if self.spell_func == 'lstm': + self.lstm_head = torch.nn.LSTM( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + num_layers=2, + # dropout=self.lstm_dropout, + bidirectional=True, + batch_first=True) # .to(torch.device("cuda")) + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(2 * self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size)) + elif self.spell_func == 'mlp': + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size)) + elif self.spell_func != 'none': + raise NotImplementedError('Prompt function ' + self.spell_func) + + def init_embedding(self, word_embeddings=None, task_tokens=None): + num_words = 5000 + with torch.no_grad(): + for i in range(self.spell_length): + rand_token = random.randrange(num_words) + if task_tokens is None: + target_embedding = word_embeddings[rand_token] + else: + word_embedding = word_embeddings[rand_token] + task_token = random.choice(task_tokens) + task_embedding = word_embeddings[task_token] + ratio = random.random() + target_embedding = word_embedding * ratio + task_embedding * ( + 1 - ratio) + self.spell_embeddings.weight.data[i] = target_embedding + + def forward(self): + prompt_embeds = self.spell_embeddings.weight.unsqueeze(0) + if self.spell_func == 'lstm': + prompt_embeds = self.lstm_head(prompt_embeds)[0] + if self.spell_func == 'lstm' or self.spell_func == 'mlp': + prompt_embeds = self.mlp_head(prompt_embeds) + return prompt_embeds diff --git a/modelscope/models/nlp/mglm/mpu/__init__.py b/modelscope/models/nlp/mglm/mpu/__init__.py new file mode 100755 index 00000000..8cca4e2c --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model parallel utility interface.""" + +from .cross_entropy import vocab_parallel_cross_entropy +from .data import broadcast_data +from .grads import clip_grad_norm +from .initialize import (destroy_model_parallel, get_data_parallel_group, + get_data_parallel_rank, get_data_parallel_world_size, + get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_src_rank, + get_model_parallel_world_size, + initialize_model_parallel, + model_parallel_is_initialized) +from .layers import (ColumnParallelLinear, ParallelEmbedding, + RowParallelLinear, VocabParallelEmbedding) +from .mappings import (copy_to_model_parallel_region, + gather_from_model_parallel_region, + reduce_from_model_parallel_region, + scatter_to_model_parallel_region) +from .random import (checkpoint, get_cuda_rng_tracker, + model_parallel_cuda_manual_seed, + partition_activations_in_checkpoint) +from .transformer import (BertParallelSelfAttention, + BertParallelTransformerLayer, + GPT2ParallelTransformer, LayerNorm) diff --git a/modelscope/models/nlp/mglm/mpu/cross_entropy.py b/modelscope/models/nlp/mglm/mpu/cross_entropy.py new file mode 100644 index 00000000..2ebcf7a8 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/cross_entropy.py @@ -0,0 +1,110 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import (get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_world_size) +from .utils import VocabUtility + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + # Copy so the input remains unchanged. + logits = vocab_parallel_logits.clone() + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(logits, dim=-1)[0] + torch.distributed.all_reduce( + logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group()) + # Subtract the maximum value. + logits.sub_(logits_max.unsqueeze(dim=-1)) + # Sum of exponential of logits along vocab dimension across all GPUs. 
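+        # Overall this computes the usual numerically stable cross entropy
+        #   loss_i = log(sum_v exp(z_iv - max_i)) - (z_i,target - max_i)
+        # except that each rank only holds a shard of the vocabulary, so the
+        # max, the sum of exponentials and the target logit are combined with
+        # all-reduce operations over the model-parallel group.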
+ exp_logits = logits.exp() + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + + # Get the partition's vocab indecies + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_model_parallel_rank() + world_size = get_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range( + partition_vocab_size, rank, world_size) + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | ( + target >= vocab_end_index) + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange( + start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Store softmax, target-mask and masked-target for backward pass. + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as thier gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange( + start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, + masked_target_1d] -= (1.0 - target_mask.view(-1).float()) + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None + + +def vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """Helper function for the cross entropy.""" + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/modelscope/models/nlp/mglm/mpu/data.py b/modelscope/models/nlp/mglm/mpu/data.py new file mode 100644 index 00000000..6f595f0f --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/data.py @@ -0,0 +1,117 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import (get_model_parallel_group, get_model_parallel_rank, + get_model_parallel_src_rank) + +_MAX_DATA_DIM = 5 + + +def _check_data_types(keys, data, target_dtype): + """Check that all the keys have the same target data type.""" + for key in keys: + assert data[key].dtype == target_dtype, '{} has data type {} which '\ + 'is different than {}'.format(key, data[key].dtype, target_dtype) + + +def _build_key_size_numel_dictionaries(keys, data): + """Build the size on rank 0 and broadcast.""" + max_dim = _MAX_DATA_DIM + sizes = [0 for _ in range(max_dim) for _ in keys] + + # Pack the sizes on rank zero. + if get_model_parallel_rank() == 0: + offset = 0 + for key in keys: + assert data[key].dim( + ) < max_dim, 'you should increase MAX_DATA_DIM' + size = data[key].size() + for i, s in enumerate(size): + sizes[i + offset] = s + offset += max_dim + + # Move to GPU and broadcast. + sizes_cuda = torch.cuda.LongTensor(sizes) + torch.distributed.broadcast( + sizes_cuda, + get_model_parallel_src_rank(), + group=get_model_parallel_group()) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Arguments: + keys: list of keys in the data disctionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries( + keys, data) + + # Pack on rank zero. + if get_model_parallel_rank() == 0: + # Check that all keys have the same data type. + _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat( + [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + else: + flatten_data = torch.empty( + total_numel, device=torch.cuda.current_device(), dtype=datatype) + + # Boradcast + torch.distributed.broadcast( + flatten_data, + get_model_parallel_src_rank(), + group=get_model_parallel_group()) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/modelscope/models/nlp/mglm/mpu/grads.py b/modelscope/models/nlp/mglm/mpu/grads.py new file mode 100644 index 00000000..a7dc6c5c --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/grads.py @@ -0,0 +1,72 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import torch +from torch._six import inf + +from .initialize import get_model_parallel_group, get_model_parallel_rank + + +def clip_grad_norm(parameters, max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + max_norm = float(max_norm) + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all GPUs. + torch.distributed.all_reduce( + total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=get_model_parallel_group()) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0 + for p in parameters: + if p.model_parallel or (get_model_parallel_rank() == 0): + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item()**norm_type + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + torch.distributed.all_reduce( + total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=get_model_parallel_group()) + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + clip_coef = max_norm / (total_norm + 1e-6) + if clip_coef < 1: + for p in parameters: + p.grad.data.mul_(clip_coef) + return total_norm diff --git a/modelscope/models/nlp/mglm/mpu/initialize.py b/modelscope/models/nlp/mglm/mpu/initialize.py new file mode 100644 index 00000000..33f8dbda --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/initialize.py @@ -0,0 +1,130 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model and data parallel groups.""" + +import torch + +from .utils import ensure_divisibility + +# Model parallel group that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Data parallel group that the current rank belongs to. 
+_DATA_PARALLEL_GROUP = None + + +def initialize_model_parallel(model_parallel_size_): + """ + Initialize model data parallel groups. + + Arguments: + model_parallel_size: number of GPUs used to parallelize model. + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model. The present function will + create 4 model parallel groups and 2 data parallel grous as: + 4 model parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 data parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + if torch.distributed.get_rank() == 0: + print('> initializing model parallel with size {}'.format( + model_parallel_size_)) + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + model_parallel_size = min(model_parallel_size_, world_size) + ensure_divisibility(world_size, model_parallel_size) + rank = torch.distributed.get_rank() + + # Build the data parallel groups. + global _DATA_PARALLEL_GROUP + assert _DATA_PARALLEL_GROUP is None, \ + 'data parallel group is already initialized' + for i in range(model_parallel_size): + ranks = range(i, world_size, model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank % model_parallel_size): + _DATA_PARALLEL_GROUP = group + + # Build the model parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, \ + 'model parallel group is already initialized' + for i in range(world_size // model_parallel_size): + ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank // model_parallel_size): + _MODEL_PARALLEL_GROUP = group + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + return torch.distributed.get_world_size(group=get_model_parallel_group()) + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + return torch.distributed.get_rank(group=get_model_parallel_group()) + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zeor + in the model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return torch.distributed.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return 
torch.distributed.get_rank(group=get_data_parallel_group()) + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None diff --git a/modelscope/models/nlp/mglm/mpu/layers.py b/modelscope/models/nlp/mglm/mpu/layers.py new file mode 100644 index 00000000..4eb94b50 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/layers.py @@ -0,0 +1,357 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import math + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm +from torch.nn.parameter import Parameter + +from .initialize import get_model_parallel_rank, get_model_parallel_world_size +from .mappings import (copy_to_model_parallel_region, + gather_from_model_parallel_region, + reduce_from_model_parallel_region, + scatter_to_model_parallel_region) +from .random import get_cuda_rng_tracker +from .utils import VocabUtility, divide, split_tensor_along_last_dim + + +def _initialize_affine_weight(weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False): + """Initialize affine weight for model parallel. + + Build the master weight on all processes and scatter + the relevant chunk.""" + # If we only use 1 process for model parallelism, bypass scatter. + world_size = get_model_parallel_world_size() + if world_size == 1: + init_method(weight) + if return_master_weight: + return weight + return None + + # Initialize master weight + master_weight = torch.empty( + output_size, input_size, dtype=weight.dtype, requires_grad=False) + init_method(master_weight) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split( + master_weight, per_partition_per_stride_size, dim=partition_dim) + rank = get_model_parallel_rank() + my_weight_list = weight_list[rank::world_size] + + with torch.no_grad(): + torch.cat(my_weight_list, dim=partition_dim, out=weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__(self, + num_embeddings, + embedding_dim, + init_method=init.xavier_normal_): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. 
+ self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the vocaburaly dimension. + self.vocab_start_index, self.vocab_end_index = \ + VocabUtility.vocab_range_from_global_vocab_size( + self.num_embeddings, get_model_parallel_rank(), + get_model_parallel_world_size()) + self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index # noqa + + # Allocate weights. + self.weight = Parameter( + torch.Tensor(self.num_embeddings_per_partition, + self.embedding_dim)) + self.weight.model_parallel = True + # And initialize. + _initialize_affine_weight(self.weight, self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, 0, + init_method) + + def forward(self, input_): + # Build the mask. + input_mask = (input_ < self.vocab_start_index) | \ + (input_ >= self.vocab_end_index) + # Mask the input. + masked_input = input_.clone() - self.vocab_start_index + masked_input[input_mask] = 0 + # Get the embeddings. + output_parallel = F.embedding(masked_input, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + # Mask the output embedding. + output_parallel[input_mask, :] = 0.0 + # Reduce across all the model parallel GPUs. + output = reduce_from_model_parallel_region(output_parallel) + return output + + +class ParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the embedding dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + init_method: method to initialize weights. + """ + + def __init__(self, + num_embeddings, + embedding_dim, + init_method=init.xavier_normal_, + keep_master_weight_for_test=False): + super(ParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set some detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the embedding dimension. + world_size = get_model_parallel_world_size() + self.embedding_dim_per_partition = divide(self.embedding_dim, + world_size) + + # Allocate weights. + self.weight = Parameter( + torch.Tensor(self.num_embeddings, + self.embedding_dim_per_partition)) + self.weight.model_parallel = True + # And initialize. + _initialize_affine_weight( + self.weight, + self.num_embeddings, + self.embedding_dim, + self.embedding_dim_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False) + + def forward(self, input_): + input_parallel = copy_to_model_parallel_region(input_) + output_parallel = F.embedding(input_parallel, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + output = gather_from_model_parallel_region(output_parallel) + return output + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. 
+ bias: If true, add bias + gather_output: If true, call all-gether on output and make Y avaiable + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + + def __init__(self, + input_size, + output_size, + bias=True, + gather_output=True, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.output_size_per_partition = divide(output_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter( + torch.Tensor(self.output_size_per_partition, self.input_size)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size_per_partition)) + self.bias.model_parallel = True + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test) + + def forward(self, input_): + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight, self.bias) + if self.gather_output: + # All-gather across the partitions. + output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + + def __init__(self, + input_size, + output_size, + bias=True, + input_is_parallel=False, + init_method=init.xavier_normal_, + stride=1, + keep_master_weight_for_test=False): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + + # Parameters. 
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter( + torch.Tensor(self.output_size, self.input_size_per_partition)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test) + + def forward(self, input_): + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = F.linear(input_parallel, self.weight) + # All-reduce across all the partitions. + output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output diff --git a/modelscope/models/nlp/mglm/mpu/mappings.py b/modelscope/models/nlp/mglm/mpu/mappings.py new file mode 100644 index 00000000..b3056dd7 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/mappings.py @@ -0,0 +1,144 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import get_model_parallel_group +from .utils import split_tensor_along_last_dim + + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_, group=group) + + return input_ + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Split along last dimension. + world_size = torch.distributed.get_world_size(group=group) + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = torch.distributed.get_rank(group=group) + output = input_list[rank].contiguous() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. 
+ last_dim = input_.dim() - 1 + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=last_dim).contiguous() + + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-redcue the input from the model parallel region.""" + + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def forward(ctx, input_): + return _split(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + + @staticmethod + def forward(ctx, input_): + return _gather(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split(grad_output) + + +# ----------------- +# Helper functions. +# ----------------- + + +def copy_to_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) diff --git a/modelscope/models/nlp/mglm/mpu/random.py b/modelscope/models/nlp/mglm/mpu/random.py new file mode 100755 index 00000000..2cdf236d --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/random.py @@ -0,0 +1,408 @@ +# Modified by Samyam Rajbhandari +# Used to partition the activations stored for backward propagation +# Therefore reduces the memory consumption + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
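
Before the RNG utilities that follow, a note on the mapping helpers defined in mappings.py above: they come in mirror-image pairs. copy_to_model_parallel_region is an identity in the forward pass and an all-reduce in the backward pass, while gather_from_model_parallel_region concatenates the partitions forward and splits the gradient backward (scatter_to_model_parallel_region and reduce_from_model_parallel_region are the remaining duals). A minimal sketch of how ColumnParallelLinear.forward combines them; the function name and the weight_partition/bias_partition arguments are illustrative, and an already initialized model parallel group is assumed:

import torch.nn.functional as F
from modelscope.models.nlp.mglm import mpu

def column_parallel_matmul(x, weight_partition, bias_partition=None):
    # Forward: identity. Backward: all-reduce of dL/dx across the model parallel group.
    x_parallel = mpu.copy_to_model_parallel_region(x)
    # Each rank multiplies by its own column block of the full weight matrix.
    y_parallel = F.linear(x_parallel, weight_partition, bias_partition)
    # Forward: all-gather the partial outputs along the last dimension.
    # Backward: split the incoming gradient so each rank keeps only its slice.
    return mpu.gather_from_model_parallel_region(y_parallel)
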
+ +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch +import contextlib + +import torch +import torch.distributed as dist +from torch import _C +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager + +from .initialize import (get_data_parallel_rank, get_model_parallel_group, + get_model_parallel_rank, + get_model_parallel_world_size) + +# from torch.utils.checkpoint import detach_variable + +PARTITION_ACTIVATIONS = False +PA_CORRECTNESS_TEST = False + + +def see_memory_usage(message, force=False): + if not force: + return + dist.barrier() + if dist.get_rank() == 0: + print(message) + print('Memory Allocated ', + torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Max Memory Allocated ', + torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Cache Allocated ', + torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') + print('Max cache Allocated ', + torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + 'GigaBytes') + print(' ') + # input("Press Any Key To Continue ..") + + +mp_rank = None # get_model_parallel_rank() +mp_size = None # get_model_parallel_world_size() +mp_group = None # get_model_parallel_group() + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' +transport_stream = None +cuda_device = None + + +def detach_variable(inputs, device=None): + if isinstance(inputs, tuple): + out = [] + for inp in inputs: + if not isinstance(inp, torch.Tensor): + out.append(inp) + continue + + requires_grad = inp.requires_grad + + if device is not None: + x = inp.to(device=device) + else: + x = inp + + x = x.detach() + x.requires_grad = requires_grad + out.append(x) + return tuple(out) + else: + raise RuntimeError( + 'Only tuple of tensors is supported. Got Unsupported input type: ', + type(inputs).__name__) + + +def _set_cuda_rng_state(new_state, device=-1): + """Sets the random number generator state of the current GPU. + + Argumentss: + new_state (torch.ByteTensor): The desired state + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + with a single change: the input state is not cloned. Cloning caused + major performance issues for +4 GPU cases. + """ + if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): + # older PyTorch + def cb(): + with device_ctx_manager(device): + _C._cuda_setRNGState(new_state) + else: + # newer PyTorch + if device == -1: + device = torch.device('cuda') + elif isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device('cuda', device) + + def cb(): + idx = device.index + if idx is None: + idx = torch.cuda.current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state) + + _lazy_call(cb) + + +class CudaRNGStatesTracker: + """Tracker for the cuda RNG states. + + Using the `add` method, a cuda rng state is initialized based on + the input `seed` and is assigned to `name`. Later, by forking the + rng state, we can perform operations and return to our starting + cuda state. + """ + + def __init__(self): + # Map from a string name to the cuda rng state. + self.states_ = {} + # Seeds are just for book keeping and ensure no seed is set twice. + self.seeds_ = set() + + def reset(self): + """Set to the initial state (no tracker).""" + self.states_ = {} + self.seeds_ = set() + + def get_states(self): + """Get rng states. 
Copy the dictionary so we have direct + pointers to the states, not just a pointer to the dictionary.""" + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states(self, states): + """Set the rng states. For efficiency purposes, we do not check + the size of seed for compatibility.""" + self.states_ = states + + def add(self, name, seed): + """Track the rng state.""" + # Check seed is not already used. + if seed in self.seeds_: + raise Exception('seed {} already exists'.format(seed)) + self.seeds_.add(seed) + # Check that state is not already defined. + if name in self.states_: + raise Exception('cuda rng state {} already exists'.format(name)) + # Get the current rng state. + orig_rng_state = torch.cuda.get_rng_state() + # Set the new state and store it. + torch.cuda.manual_seed(seed) + self.states_[name] = torch.cuda.get_rng_state() + # Reset rng state to what it was. + _set_cuda_rng_state(orig_rng_state) + + @contextlib.contextmanager + def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): + """Fork the cuda rng state, perform operations, and exit with + the original state.""" + # Check if we have added the state + if name not in self.states_: + raise Exception('cuda rng state {} is not added'.format(name)) + # Store current rng state. + orig_cuda_rng_state = torch.cuda.get_rng_state() + # Set rng state to the desired one + _set_cuda_rng_state(self.states_[name]) + # Do the stuff we wanted to do. + try: + yield + finally: + # Update the current rng state for later use. + self.states_[name] = torch.cuda.get_rng_state() + # And set the state to the original state we started with. + _set_cuda_rng_state(orig_cuda_rng_state) + + +# RNG tracker object. +_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + + +def get_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _CUDA_RNG_STATE_TRACKER + + +def model_parallel_cuda_manual_seed(seed): + """Initialize model parallel cuda seed. + + This function should be called after the model parallel is + initialized. Also, no torch.cuda.manual_seed should be called + after this function. Basically, this is replacement for that + function. + Two set of RNG states are tracked: + default state: This is for data parallelism and is the same among a + set of model parallel GPUs but different across + different model paralle groups. This is used for + example for dropout in the non-model-parallel regions. + model-parallel state: This state is different among a set of model + parallel GPUs, but the same across data parallel + groups. This is used for example for dropout in + model parallel regions. + """ + # 2718 is just for fun and any POSITIVE value will work. + offset = seed + 2718 + model_parallel_seed = offset + get_model_parallel_rank() + # Data parallel gets the original sedd. + data_parallel_seed = seed + + if torch.distributed.get_rank() == 0: + print( + '> initializing model parallel cuda seeds on global rank {}, ' + 'model parallel rank {}, and data parallel rank {} with ' + 'model parallel seed: {} and data parallel seed: {}'.format( + torch.distributed.get_rank(), get_model_parallel_rank(), + get_data_parallel_rank(), model_parallel_seed, + data_parallel_seed), + flush=True) + _CUDA_RNG_STATE_TRACKER.reset() + # Set the default state. + torch.cuda.manual_seed(data_parallel_seed) + # and model parallel state. 
+ _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, + model_parallel_seed) + + +def get_partition_start(item): + global mp_rank, mp_size, mp_group + partition_size = get_partition_size(item) + start = partition_size * mp_rank + return int(start) + + +def get_partition_size(item): + global mp_rank, mp_size, mp_group + size = item.numel() + partition_size = size / mp_size + return int(partition_size) + + +def get_full_inputs(tensors): + inputs = [] + for i in range(int(len(tensors) / 2) - 1): + item = tensors[2 * i] + size = tensors[2 * i + 1] + partition_size = item.numel() + tensor_size = partition_size * mp_size + flat_tensor = torch.zeros([tensor_size], + dtype=item.dtype, + device=item.device) + partitions = [] + for i in range(mp_size): + part_i = flat_tensor.narrow(0, partition_size * i, partition_size) + if i == mp_rank: + part_i.copy_(item) + partitions.append(part_i) + dist.all_gather(partitions, partitions[mp_rank], group=mp_group) + input_tensor = flat_tensor.view(list(size.numpy())) + item.data = input_tensor.data + + inputs.append(item) + inputs.append(tensors[-2]) + + return tuple(inputs) + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + + @staticmethod + def forward(ctx, run_function, *args): + ctx.run_function = run_function + global mp_rank, mp_size, mp_group + if mp_rank is None: + mp_rank = get_model_parallel_rank() + mp_size = get_model_parallel_world_size() + mp_group = get_model_parallel_group() + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + if cuda_device is None: + if dist.get_rank() == 0: + print( + f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}' + ) + + cuda_device = torch.cuda.current_device() + # The transport stream is used to overlap the allgather communication for the activations + # with the computation in the backward pass + transport_stream = torch.cuda.Stream(device=cuda_device) + + if PARTITION_ACTIVATIONS: + inputs = [ + item.detach().contiguous().view(-1).narrow( + 0, get_partition_start(item), + get_partition_size(item)).clone() for item in args[:-1] + ] + inputs.append(args[-1]) + + # just in case something funky is happening such as reuse of inputs + inputs_cuda = [item.to(cuda_device) for item in args] + + # Copy the rng states. 
+ ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # ctx.save_for_backward(*args) + with torch.no_grad(): + outputs = run_function(*inputs_cuda) + + del inputs_cuda + + if PARTITION_ACTIVATIONS: + new_args = [] + for arg, inp in zip(args, inputs): + size = torch.tensor(arg.size()) + arg.data = inp.data + new_args.append(arg) + new_args.append(size) + ctx.save_for_backward(*new_args) + else: + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError('Checkpointing is not compatible with .grad(), ' + 'please use .backward() if possible') + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + + if PARTITION_ACTIVATIONS: + with torch.cuda.stream(transport_stream): + inputs = get_full_inputs(ctx.saved_tensors) + detached_inputs = detach_variable(inputs) + else: + inputs = ctx.saved_tensors + detached_inputs = detach_variable(inputs) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + if PARTITION_ACTIVATIONS: + current_stream = torch.cuda.current_stream() + current_stream.wait_stream(transport_stream) + + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs, ) + torch.autograd.backward(outputs, args) + return (None, ) + tuple(inp.grad for inp in detached_inputs) + + +def checkpoint(function, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, *args) + + +def partition_activations_in_checkpoint(partition_activation): + global PARTITION_ACTIVATIONS + PARTITION_ACTIVATIONS = partition_activation + if dist.get_rank() == 0: + print( + f'**************Partition Activations {PARTITION_ACTIVATIONS}************' + ) diff --git a/modelscope/models/nlp/mglm/mpu/tests/__init__.py b/modelscope/models/nlp/mglm/mpu/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/mpu/tests/commons.py b/modelscope/models/nlp/mglm/mpu/tests/commons.py new file mode 100644 index 00000000..ecfd5e72 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/commons.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import random + +import mpu +import numpy +import torch + + +class IdentityLayer(torch.nn.Module): + + def __init__(self, size, scale=1.0): + super(IdentityLayer, self).__init__() + self.weight = torch.nn.Parameter(scale * torch.randn(size)) + + def forward(self): + return self.weight + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def initialize_distributed(backend='nccl'): + """Initialize torch.distributed.""" + # Get local rank in case it is provided. + parser = argparse.ArgumentParser() + parser.add_argument( + '--local_rank', + type=int, + default=None, + help='local rank passed from distributed launcher') + args = parser.parse_args() + local_rank = args.local_rank + + # Get rank and world size. + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv('WORLD_SIZE', '1')) + + print('> initializing torch.distributed with local rank: {}, ' + 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) + + # Set the device id. + device = rank % torch.cuda.device_count() + if local_rank is not None: + device = local_rank + torch.cuda.set_device(device) + + # Call the init process. + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend=backend, + world_size=world_size, + rank=rank, + init_method=init_method) + + +def print_separator(message): + torch.distributed.barrier() + filler_len = (78 - len(message)) // 2 + filler = '-' * filler_len + string = '\n' + filler + ' {} '.format(message) + filler + if torch.distributed.get_rank() == 0: + print(string, flush=True) + torch.distributed.barrier() diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py new file mode 100644 index 00000000..47fd1d7e --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py @@ -0,0 +1,106 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
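
The test below builds the same batch of logits twice and checks that vocab_parallel_cross_entropy, whose vocabulary shards live on different ranks, reproduces the ordinary torch.nn.functional.cross_entropy loss and gradient. A condensed single-process version of that check, assuming torch.distributed has been initialized with one process and mpu.initialize_model_parallel(1) has been called so the local shard covers the whole vocabulary; the tensor shapes here are arbitrary:

import torch
import torch.nn.functional as F
from modelscope.models.nlp.mglm import mpu

logits = torch.randn(4, 16, 3200, device='cuda')
target = torch.randint(0, 3200, (4, 16), device='cuda')

loss_mpu = mpu.vocab_parallel_cross_entropy(logits, target).mean()
loss_ref = F.cross_entropy(
    logits.view(-1, logits.size(-1)), target.view(-1),
    reduction='none').view_as(target).mean()
# With a single model parallel rank the two losses should agree to
# floating point tolerance, mirroring the assertion in the test below.
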
+ +import random +import sys + +import mpu +import torch +import torch.nn.functional as F +from commons import (IdentityLayer, initialize_distributed, print_separator, + set_random_seed) +from mpu.cross_entropy import vocab_parallel_cross_entropy + +sys.path.append('../..') + + +def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, + seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + target = torch.cuda.LongTensor(size=(batch_size, + seq_length)).random_(0, vocab_size) + loss = F.cross_entropy( + logits.view(-1, + logits.size()[-1]), target.view(-1), + reduction='none').view_as(target).mean() + loss.backward() + return loss, identity.weight.grad + + +def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + logits_parallel = mpu.scatter_to_model_parallel_region(logits) + target = torch.cuda.LongTensor(size=(batch_size, + seq_length)).random_(0, vocab_size) + loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() + loss.backward() + return loss, identity.weight.grad + + +def test_cross_entropy(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cross entropy with model parallel size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 13 + seq_length = 17 + vocab_size_per_partition = 11 + logits_scale = 1000.0 + vocab_size = vocab_size_per_partition * model_parallel_size + seed = 1234 + + loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed) + + error = loss_torch.sub_(loss_mpu).abs().max() + print(' max error in loss on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = grad_torch.sub_(grad_mpu).abs().max() + print(' max error in grad on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cross entropy') + test_cross_entropy(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_data.py b/modelscope/models/nlp/mglm/mpu/tests/test_data.py new file mode 100644 index 00000000..66575300 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_data.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator +from mpu import data as data_utils + +sys.path.append('../..') + + +def test_boradcast_data(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print( + '> testing boradcast_data with model parallel size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + torch.manual_seed(1234 + mpu.get_data_parallel_rank()) + model_parallel_size = mpu.get_model_parallel_world_size() + + key_size_t = { + 'key1': [7, 11], + 'key2': [8, 2, 1], + 'key3': [13], + 'key4': [5, 1, 2], + 'key5': [5, 12] + } + keys = list(key_size_t.keys()) + + data = {} + data_t = {} + for key in key_size_t: + data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) + data_t[key] = data[key].clone() + data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) + data_t['keyX'] = data['keyX'].clone() + if mpu.get_model_parallel_rank() != 0: + data = None + + data_utils._check_data_types(keys, data_t, torch.int64) + key_size, key_numel, \ + total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) + for key in keys: + assert key_size[key] == key_size_t[key] + total_numel_t = 0 + for key in keys: + target_size = functools.reduce(operator.mul, key_size_t[key], 1) + assert key_numel[key] == target_size + total_numel_t += target_size + assert total_numel == total_numel_t + + data_b = data_utils.broadcast_data(keys, data, torch.int64) + for key in keys: + tensor = data_t[key].cuda() + assert data_b[key].sub(tensor).abs().max() == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test test boradcast data') + test_boradcast_data(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py new file mode 100644 index 00000000..df62d213 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py @@ -0,0 +1,95 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
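
The broadcast_data test above exercises the intended data loading pattern: only rank 0 of each model parallel group materializes the batch on CPU, and the other ranks of the group receive it over a broadcast and get back GPU tensors. A hedged usage sketch, assuming the distributed environment and mpu.initialize_model_parallel(...) are already set up; the key names and shapes are illustrative:

import torch
from modelscope.models.nlp.mglm import mpu

keys = ['input_ids', 'labels']
if mpu.get_model_parallel_rank() == 0:
    # torch.randint returns int64 tensors, matching the datatype declared below.
    data = {
        'input_ids': torch.randint(0, 1000, (8, 128)),
        'labels': torch.randint(0, 1000, (8, 128)),
    }
else:
    data = None  # non-source ranks only need the keys and the dtype
batch = mpu.broadcast_data(keys, data, torch.int64)
# batch['input_ids'] and batch['labels'] are now cuda LongTensors on every rank.
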
+ +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator + +sys.path.append('../..') + + +def test_initialize_model_parallel(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing initialize_model_parallel with size {} ...'.format( + model_parallel_size)) + model_parallel_size_ = min(model_parallel_size, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size_) + assert mpu.model_parallel_is_initialized() + + # Checks. + def check(group, world_size, rank): + assert world_size == torch.distributed.get_world_size(group=group) + assert rank == torch.distributed.get_rank(group=group) + + # Model parallel. + world_size = model_parallel_size_ + rank = torch.distributed.get_rank() % model_parallel_size_ + assert world_size == mpu.get_model_parallel_world_size() + assert rank == mpu.get_model_parallel_rank() + check(mpu.get_model_parallel_group(), world_size, rank) + + # Data parallel. + world_size = torch.distributed.get_world_size() // model_parallel_size_ + rank = torch.distributed.get_rank() // model_parallel_size + assert world_size == mpu.get_data_parallel_world_size() + assert rank == mpu.get_data_parallel_rank() + check(mpu.get_data_parallel_group(), world_size, rank) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_get_model_parallel_src_rank(model_parallel_size_): + + if torch.distributed.get_rank() == 0: + print('> testing get_model_parallel_src_rank with size {} ...'.format( + model_parallel_size_)) + model_parallel_size = min(model_parallel_size_, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size) + assert mpu.model_parallel_is_initialized() + + # Checks + src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() + assert mpu.get_model_parallel_src_rank() == src_rank + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test initialize model parallel') + test_initialize_model_parallel(model_parallel_size) + print_separator('test model parallel source rank') + test_get_model_parallel_src_rank(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_layers.py b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py new file mode 100644 index 00000000..2dbc987a --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py @@ -0,0 +1,533 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
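
The initialization tests above boil down to rank arithmetic, and the 8-GPU example in the initialize_model_parallel docstring is the quickest way to see it. A GPU-free sketch of that bookkeeping; the group lists and assertions are illustrations of the formulas, not calls into the library:

world_size, model_parallel_size = 8, 2

model_parallel_groups = [
    list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
    for i in range(world_size // model_parallel_size)
]  # [[0, 1], [2, 3], [4, 5], [6, 7]]

data_parallel_groups = [
    list(range(i, world_size, model_parallel_size))
    for i in range(model_parallel_size)
]  # [[0, 2, 4, 6], [1, 3, 5, 7]]

rank = 5
assert rank % model_parallel_size == 1   # get_model_parallel_rank(): position within [4, 5]
assert rank // model_parallel_size == 2  # get_data_parallel_rank(): position within [1, 3, 5, 7]
assert (rank // model_parallel_size) * model_parallel_size == 4  # get_model_parallel_src_rank()
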
+ +import random +import sys + +import mpu +import torch +import torch.nn.init as init +from commons import initialize_distributed, print_separator, set_random_seed +from mpu import layers +from torch.nn.parameter import Parameter + +sys.path.append('../..') + + +def test_parallel_embedding(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing parallel embedding with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 17 + seq_length = 23 + vocab_size = 48 + hidden_size = 16 + seed = 1236 + + set_random_seed(123) + input_data = torch.LongTensor(size=(batch_size, seq_length)).random_( + 0, vocab_size).cuda() + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + + set_random_seed(seed) + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + + output = embedding_original(input_data) + loss_original = torch.mul(output, loss_weight).sum() + loss_original.backward() + + set_random_seed(seed) + embedding_parallel = layers.ParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_parallel(input_data) + loss_parallel = torch.mul(output, loss_weight).sum() + loss_parallel.backward() + + set_random_seed(seed) + embedding_vocab_parallel = layers.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_vocab_parallel(input_data) + loss_vocab_parallel = torch.mul(output, loss_weight).sum() + loss_vocab_parallel.backward() + + torch.distributed.barrier() + error = loss_parallel.sub(loss_original).abs() + print(' error in loss (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + torch.distributed.barrier() + error = loss_vocab_parallel.sub(loss_original).abs() + print(' error in loss (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + hidden_size // model_parallel_size, + 1)[mpu.get_model_parallel_rank()] + error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() + print(' error in grad (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + vocab_size // model_parallel_size, + 0)[mpu.get_model_parallel_rank()] + error = embedding_vocab_parallel.weight.grad.sub( + weight_grad_orig).abs().max() + print(' error in grad (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_initialize_affine_weight(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing initialize_affine_weight with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + + # 
--------------- + # Column parallel + # --------------- + weight = torch.empty(output_size_coeff, input_size) + set_random_seed(seed) + layers._initialize_affine_weight(weight, output_size, input_size, + output_size_coeff, 0, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split( + master_weight, output_size_coeff, dim=0)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' column parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # ------------ + # Row parallel + # ------------ + weight = torch.empty(output_size, input_size_coeff) + set_random_seed(seed) + mpu.layers._initialize_affine_weight(weight, output_size, input_size, + input_size_coeff, 1, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split( + master_weight, input_size_coeff, dim=1)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' row parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer2D(torch.nn.Module): + + def __init__(self, m, n): + super(IdentityLayer2D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def test_column_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing ColumnParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.ColumnParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
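+    # Added note: for Y = X A^T + b the analytic gradients are
+    # dL/dA = (dL/dY)^T X, dL/db = 1^T dL/dY and dL/dX = (dL/dY) A.
+    # ColumnParallelLinear keeps output_size_coeff rows of A per rank, so the
+    # reference gradients are sliced along dim 0 before the comparison below.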
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split( + dLdA, output_size_coeff, dim=0)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + my_dLdb = torch.split( + dLdb, output_size_coeff, dim=0)[rank].contiguous().clone() + error = my_dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def test_row_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing RowParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.RowParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
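+    # Added note: same analytic check as in the column-parallel test, but
+    # RowParallelLinear keeps input_size_coeff columns of A per rank, so
+    # dL/dA is sliced along dim 1 while the bias gradient stays full.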
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split( + dLdA, input_size_coeff, dim=1)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer3D(torch.nn.Module): + + def __init__(self, m, n, k): + super(IdentityLayer3D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n, k)) + torch.nn.init.xavier_normal_(self.weight) + + def forward(self): + return self.weight + + +def parallel_self_attention(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, + sequence_length): + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( + ) # noqa + hidden_size = hidden_size_per_att_head * num_att_heads + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, + dropout_prob).cuda() + loss_weight = torch.randn([batch_size, sequence_length, + hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = attention_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + attention_layer, identity_layer + + +def test_parallel_self_attention(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelSelfAttention with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + dropout_prob = 0.0 # has to be zero + batch_size = 5 + sequence_length = 13 + + rank_1, hideen_size_1, model_parallel_size_1, loss_1, \ + attention_layer_1, identity_layer_1 = parallel_self_attention( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + attention_layer, identity_layer = parallel_self_attention( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) + assert hideen_size_1 == hidden_size + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + 
torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + my_lin_grad_list = torch.split( + attention_layer_1.query_key_value.weight.grad, + hidden_size // model_parallel_size, 0)[rank::model_parallel_size] + my_lin_grad = torch.cat(my_lin_grad_list, dim=0) + error = my_lin_grad.sub( + attention_layer.query_key_value.weight.grad).abs().max() + torch.distributed.barrier() + print(' weight gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def parallel_transformer(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, + sequence_length): + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( + ) + hidden_size = hidden_size_per_att_head * num_att_heads + intermediate_size = 4 * hidden_size + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + transformer_layer = mpu.BertParallelTransformerLayer( + hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, + torch.nn.functional.relu, 1.0e-5).cuda() + + loss_weight = torch.randn([batch_size, sequence_length, + hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = transformer_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer + + +def test_parallel_transformer_layer(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelTransformerLayer with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + batch_size = 5 + sequence_length = 13 + + rank_1, hidden_size_1, model_parallel_size_1, loss_1, \ + transformer_layer_1, identity_layer_1 = parallel_transformer( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer = parallel_transformer( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +if __name__ == '__main__': + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = 
False + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + print_separator('test initialize affine weight') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_initialize_affine_weight(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test parallel embedding') + test_parallel_embedding(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test column-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_column_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test row-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_row_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel self-attention') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_self_attention(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel transformer') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_transformer_layer(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_random.py b/modelscope/models/nlp/mglm/mpu/tests/test_random.py new file mode 100644 index 00000000..55cc2351 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/tests/test_random.py @@ -0,0 +1,206 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import mpu +import torch +from commons import initialize_distributed, print_separator + +sys.path.append('../..') + + +def test_set_cuda_rng_state(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing set_rng_state with size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + size = 123 + seed = 1234 + torch.cuda.manual_seed(seed) + tensor = torch.cuda.FloatTensor(size) + + # Get the state + rng_state = torch.cuda.get_rng_state() + rng_state_copy = rng_state.clone() + + # Do some stuff. + for _ in range(5): + torch.randn(size, out=tensor) + result_1 = tensor.clone() + + assert rng_state.sub(rng_state_copy).max() == 0 + assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + + # State should be different. + new_rng_state = torch.cuda.get_rng_state() + max_diff = new_rng_state.sub(rng_state).max() + print( + ' max diff in rng state (should be non-zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), max_diff)) + assert max_diff > 0 + + # Reset the rng state and do the same stuff. 
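+    # _set_cuda_rng_state restores the generator state captured in rng_state,
+    # so the two loops below must reproduce result_1 exactly without
+    # modifying rng_state itself (both properties are asserted further down).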
+ mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + result_2 = tensor.clone() + + # Results should be the same + error = result_2.sub(result_1).abs().max() + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Input state should have remained intact. + error = rng_state.sub(rng_state_copy).max() + print(' max error in rng state (should be zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), error)) + assert error == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_cuda_rng_tracker(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cuda rng tracker with size {} ...'.format( + model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed_1 = 1234 + seed_2 = 4321 + size = [12, 21] + tensor = torch.cuda.FloatTensor(size) + + # Set to seed_1 and generate two tensors. + torch.cuda.manual_seed(seed_1) + torch.randn(size, out=tensor) + target_11 = tensor.clone() + torch.randn(size, out=tensor) + target_12 = tensor.clone() + + # Set to seed_2 and generate two tensors. + torch.cuda.manual_seed(seed_2) + torch.randn(size, out=tensor) + target_21 = tensor.clone() + torch.randn(size, out=tensor) + target_22 = tensor.clone() + + # Now if we interleave seed_1 and seed_2, + # we should still get the same tensors + torch.cuda.manual_seed(seed_1) + mpu.get_cuda_rng_tracker().add('test', seed_2) + + torch.randn(size, out=tensor) + result_11 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_21 = tensor.clone() + + torch.randn(size, out=tensor) + result_12 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_22 = tensor.clone() + + diff = result_11.sub(result_21).abs().max() + diff = min(diff, result_12.sub(result_22).abs().max()) + print(' max diff in generated tensors (should be non-zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) + assert diff > 1.0e-6 + error = max( + result_11.sub(target_11).abs().max(), + result_12.sub(target_12).abs().max()) + error = max(error, result_21.sub(target_21).abs().max()) + error = max(error, result_22.sub(target_22).abs().max()) + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_model_parallel_cuda_manual_seed(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing model parallel cuda manual seed with size {} ...'. 
+ format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + mpu.model_parallel_cuda_manual_seed(12345) + assert torch.cuda.initial_seed() == 12345 + with mpu.get_cuda_rng_tracker().fork(): + assert torch.cuda.initial_seed() == (12345 + 2718 + + mpu.get_model_parallel_rank()) + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test set rng state') + test_set_cuda_rng_state(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cuda rng tracker') + test_cuda_rng_tracker(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test model parallel cuda manual seed') + test_model_parallel_cuda_manual_seed(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/transformer.py b/modelscope/models/nlp/mglm/mpu/transformer.py new file mode 100755 index 00000000..c12b2e10 --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/transformer.py @@ -0,0 +1,1200 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformer.""" + +import math + +import deepspeed +import torch +import torch.nn.init as init +from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm + +from .initialize import get_model_parallel_world_size +from .layers import ColumnParallelLinear, RowParallelLinear +from .mappings import gather_from_model_parallel_region +from .random import checkpoint, get_cuda_rng_tracker +from .utils import divide, split_tensor_along_last_dim + + +class PositionalEmbedding(torch.nn.Module): + + def __init__(self, hidden_size): + super(PositionalEmbedding, self).__init__() + + self.hidden_size = hidden_size + + inv_freq = 1 / ( + 10000**(torch.arange(0.0, hidden_size, 2.0) / hidden_size)) # noqa + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + + +class ParallelCrossAttention(torch.nn.Module): + """Parallel cross-attention layer for Transformer""" + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=None): + super(ParallelCrossAttention, self).__init__() + # Set output layer initialization if not provided. 
+ if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + # Strided linear layer. + self.query = ColumnParallelLinear( + hidden_size, + hidden_size, + gather_output=False, + init_method=init_method) + self.key_value = ColumnParallelLinear( + hidden_size, + 2 * hidden_size, + stride=2, + gather_output=False, + init_method=init_method) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, encoder_states, cross_mask): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + mixed_query_layer = self.query(hidden_states) + mixed_x_layer = self.key_value(encoder_states) + (mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 2) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + if cross_mask is not None: + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, cross_mask) - \ + 10000.0 * (1.0 - cross_mask) # noqa + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. 
[b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + + +class ParallelSelfAttention(torch.nn.Module): + """Parallel self-attention layer for GPT2. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). + num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + attention_dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(ParallelSelfAttention, self).__init__() + self.performer = performer + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + self.relative_encoding = relative_encoding + self.attention_scale = attention_scale + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * hidden_size, + stride=3, + gather_output=False, + init_method=init_method) + if relative_encoding: + self.relative = ColumnParallelLinear( + hidden_size, + hidden_size, + gather_output=False, + init_method=init_method) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. 
+ """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + @staticmethod + def _rel_shift(x, zero_triu=False): + # ql x kl x bsz x h + # bsz x h x ql x kl + zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1), + device=x.device, + dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2)) + + x = x_padded[:, :, 1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None] + + return x + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + query_length = hidden_states.size(1) + + if mem is None: + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim( + mixed_x_layer, 3) + else: + cat = torch.cat((mem, hidden_states), 1) + mixed_x_layer = self.query_key_value(cat) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim( + mixed_x_layer, 3) + mixed_query_layer = mixed_query_layer[:, -query_length:] + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + if self.relative_encoding: + relative_layer = self.relative(position_embeddings) + relative_layer = self._transpose_for_scores( + relative_layer) # 1 (bsz) x n_head x klen x d_head + # Raw attention scores. [b, np, qs, ks] + rw_head_q = query_layer + r_w_bias.unsqueeze(1) + ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2)) + rr_head_q = query_layer + r_r_bias.unsqueeze(1) + bd_score = torch.matmul(rr_head_q, + relative_layer.transpose(-1, -2)) + bd_score = self._rel_shift(bd_score) # qlen x klen x bsz x n_head + # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen + + attention_scores = ac_score + bd_score + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + else: + if self.attention_scale > 1.0: + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul( + query_layer / math.sqrt(self.attention_scale), + key_layer.transpose(-1, -2) + / math.sqrt(self.hidden_size_per_attention_head + * self.attention_scale)) + else: + attention_scores = torch.matmul( + query_layer, + key_layer.transpose(-1, -2) + / math.sqrt(self.hidden_size_per_attention_head)) + + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, ltor_mask) + if self.attention_scale > 1.0: + max_attention_scores = attention_scores.max( + dim=-1, keepdim=True)[0] + attention_scores -= max_attention_scores + attention_scores *= self.attention_scale + # if torch.distributed.get_rank() == 0: + # print(min_attention_scores, attention_scores.max().item()) + attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask) + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
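+        # fork() switches to the model-parallel RNG stream (seeded per rank
+        # by model_parallel_cuda_manual_seed), so every partition draws an
+        # independent dropout mask here.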
+ with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * ( + 1.0 + torch.tanh(0.7978845608028654 * x * # noqa + (1.0 + 0.044715 * x * x))) # noqa + + +def gelu(x): + return gelu_impl(x) + + +class ParallelMLP(torch.nn.Module): + """MLP for GPT2. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform gelu transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + + Arguments: + hidden_size: The hidden size of the self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layer initialization. If None, + use `init_method`. + """ + + def __init__(self, + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=None): + super(ParallelMLP, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Project to 4h. + self.dense_h_to_4h = ColumnParallelLinear( + hidden_size, + 4 * hidden_size, + gather_output=False, + init_method=init_method) + # Project back to h. + self.dense_4h_to_h = RowParallelLinear( + 4 * hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method) + self.dropout = torch.nn.Dropout(output_dropout_prob) + + def forward(self, hidden_states): + # [b, s, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = gelu(intermediate_parallel) + + # [b, s, h] + output = self.dense_4h_to_h(intermediate_parallel) + output = self.dropout(output) + return output + + +class ParallelDecoderLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. 
+ """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None): + super(ParallelDecoderLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.self_attention = ParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + # Layernorm after the self attention. + self.post_self_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + self.cross_attention = ParallelCrossAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + # Layernorm after the cross attention. + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = ParallelMLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + encoder_states, + ltor_mask, + cross_mask=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + self_attention_output = self.self_attention(layernorm_output, + ltor_mask) + # Residual connection. + self_layernorm_input = hidden_states + self_attention_output + # Layer norm post the self attention. + self_layernorm_output = self.post_self_layernorm(self_layernorm_input) + # Cross attention + attention_output = self.cross_attention(self_layernorm_output, + encoder_states, cross_mask) + # Residual connection + layernorm_input = self_layernorm_input + attention_output + # Layer norm post the cross attention + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. + output = layernorm_input + mlp_output + return output + + +class ParallelTransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. 
+ """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(ParallelTransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = ParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Layernorm on the input data. + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = ParallelMLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, + position_embeddings, r_w_bias, + r_r_bias, mem) + # Residual connection. + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. + output = layernorm_input + mlp_output + + return output + + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GPT2ParallelTransformer(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). 
+ use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + + def __init__( + self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + relative_encoding=False, + block_position_encoding=False, + performer=False, + use_decoder_layer=False, + attention_scale=1.0, + ): + super(GPT2ParallelTransformer, self).__init__() + self.hidden_size = hidden_size + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + self.performer = performer + self.use_decoder_layer = use_decoder_layer + assert not (performer and relative_encoding) + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method( + init_method_std, num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.relative_encoding = relative_encoding + self.block_position_encoding = block_position_encoding + if relative_encoding: + # Relative position embedding + self.position_embeddings = PositionalEmbedding(hidden_size) + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_attention_head = divide( + hidden_size, num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + self.r_w_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head)) + self.r_w_bias.model_parallel = True + self.r_r_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head)) + self.r_r_bias.model_parallel = True + # Always initialize bias to zero. + with torch.no_grad(): + self.r_w_bias.zero_() + self.r_r_bias.zero_() + else: + # Position embedding (serial). + if block_position_encoding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + self.block_position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + torch.nn.init.normal_( + self.block_position_embeddings.weight, + mean=0.0, + std=init_method_std) + else: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, hidden_size) + # Initialize the position embeddings. + torch.nn.init.normal_( + self.position_embeddings.weight, mean=0.0, std=init_method_std) + + def get_layer(): + if use_decoder_layer: + return ParallelDecoderLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method) + else: + return ParallelTransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. 
+ self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def forward(self, + hidden_states, + position_ids, + attention_mask, + memory_states=None, + encoder_states=None, + return_memory=False, + detach_memory=True): + batch_size, query_length = hidden_states.size()[:2] + memory_length = memory_states[0].size(1) if memory_states else 0 + key_length = query_length + memory_length + # attention mask is the beginning postion of B region, \in [0, query_len) + is_scalar = torch.numel(attention_mask) == 1 + is_sep = is_scalar or torch.numel(attention_mask) == batch_size + if self.performer: + assert is_scalar, 'attention_mask should be a scalar to indicate the seperation position.' + assert memory_length == 0, 'Do not support transformer-xl.' + if is_sep: + sep = attention_mask.item() if is_scalar else attention_mask + + # conventional transformer + def build_mask_matrix(seq_length, sep, memory_length=0): + m = hidden_states.new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + if is_scalar: + m[0, :, :sep] = 1 + else: + m = m.expand(batch_size, -1, -1) + ids = torch.arange( + seq_length, device=sep.device, + dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) + m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1) + if memory_length > 0: + m = m.expand(batch_size, -1, -1) + m = torch.cat( + (hidden_states.new_ones((batch_size, seq_length, + memory_length)), m), # noqa + dim=2) # noqa + m = m.unsqueeze(1) + return m + + if not self.performer: + attention_mask = build_mask_matrix( + query_length, sep, memory_length=memory_length) + else: + attention_mask = attention_mask[:, :, :, + -query_length - memory_length:] + + if self.relative_encoding: + position_sequence = torch.arange( + key_length - 1, + -1, + -1.0, + device=hidden_states.device, + dtype=hidden_states.dtype) + position_embeddings = self.position_embeddings(position_sequence) + # Apply dropout + position_embeddings = self.embedding_dropout(position_embeddings) + else: + if self.block_position_encoding: + position_ids, block_position_ids = position_ids[:, + 0], position_ids[:, + 1] + position_embeddings = self.position_embeddings(position_ids) + hidden_states = hidden_states + position_embeddings + if self.block_position_encoding: + block_position_embeddings = self.block_position_embeddings( + block_position_ids) + hidden_states = hidden_states + block_position_embeddings + hidden_states = self.embedding_dropout(hidden_states) + + def check_detach(_hidden_states): + if detach_memory: + return _hidden_states.detach() + return _hidden_states + + if self.max_memory_length > 0 or return_memory: + mem_layers = [check_detach(hidden_states)] + else: + mem_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers_ = self.layers[start:end] + x_, inputs = inputs[0], inputs[1:] + if self.relative_encoding: + inputs, mems_ = inputs[:4], inputs[4:] + else: + inputs, mems_ = inputs[:1], inputs[1:] + for i, layer in enumerate(layers_): + mem_i_ = mems_[i] if mems_ else None + x_ = layer(x_, *inputs, mem=mem_i_) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(x_)) + return x_ + + return custom_forward + + if self.checkpoint_activations: + l = 0 # noqa + num_layers = len(self.layers) + chunk_length = self.checkpoint_num_layers + while l < num_layers: + args = 
[hidden_states, attention_mask + ] if not self.use_decoder_layer else [ + hidden_states, + encoder_states, + attention_mask # noqa + ] # noqa + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + if memory_states: + args += memory_states[l:l + chunk_length] + hidden_states = checkpoint(custom(l, l + chunk_length), *args) + l += chunk_length # noqa + else: + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask + ] if not self.use_decoder_layer else [ + hidden_states, + encoder_states, + attention_mask # noqa + ] # noqa + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + mem_i = memory_states[i] if memory_states else None + hidden_states = layer(*args, mem=mem_i) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(hidden_states)) + + # Final layer norm. + output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0 or return_memory: + mem_layers = self.update_mems( + mem_layers, memory_states, return_memory=return_memory) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems, return_memory=False): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = memory_length + query_length + if not return_memory: + new_memory_length = min(self.max_memory_length, new_memory_length) + new_mems = [] + # with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append( + torch.cat((mems[i][:, -new_memory_length + query_length:], + hiddens[i]), + dim=1)) + return new_mems + + +class BertParallelSelfAttention(torch.nn.Module): + """Parallel self-attention layer for BERT. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). + num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size be divisible by n. + dropout_prob: dropout probability for the attention scores. + output_parallel: If true, no all-gather is done on the output and + the output values will be per partition. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + + def __init__(self, + hidden_size, + num_attention_heads, + dropout_prob, + output_parallel=False, + init_method=init.xavier_normal_): + super(BertParallelSelfAttention, self).__init__() + # Input configuration. + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.dropout_prob = dropout_prob + self.output_parallel = output_parallel + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide( + num_attention_heads, world_size) + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * hidden_size, + stride=3, + gather_output=False, + init_method=init_method) + # Dropout. 
Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.dropout = torch.nn.Dropout(dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, # noqa + self.hidden_size_per_attention_head) # noqa + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + + # Attention heads. [b, s, hp] + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + # Raw attention scores. [b, np, s, s] + norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head)) + attention_scores = torch.matmul( + query_layer / norm_factor, + key_layer.transpose(-1, -2) / norm_factor) + # Apply the attention mask. + attention_scores += attention_mask + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size_per_partition, ) # noqa + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + if self.output_parallel: + output = context_layer + else: + output = gather_from_model_parallel_region(context_layer) + + return output + + +class BertParallelTransformerOutput(torch.nn.Module): + """The output layer used after self attention and intermediate + parts of transformer layer.""" + + def __init__(self, + input_size, + output_size, + dropout_prob, + layernorm_epsilon=1.0e-12, + input_is_parallel=False, + init_method=init.xavier_normal_): + super(BertParallelTransformerOutput, self).__init__() + # Components. + self.dense = RowParallelLinear( + input_size, + output_size, + input_is_parallel=input_is_parallel, + init_method=init_method) + self.dropout = torch.nn.Dropout(dropout_prob) + self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + layernorm_input = hidden_states + input_tensor + hidden_states = self.layernorm(layernorm_input) + return hidden_states + + +class BertParallelTransformerLayer(torch.nn.Module): + """A single layer transformer for Bert. 
+ + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + intermediate_size: size of the intermediate state after + self attention. In both BERT and GPT + this is set to be 4 times the hidden + size. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + intermediate_activation_fn: activation function for output + of intermediate. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + """ + + def __init__(self, + hidden_size, + intermediate_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + intermediate_activation_fn, + layernorm_epsilon, + init_method=init.xavier_normal_): + super(BertParallelTransformerLayer, self).__init__() + + # Self attention. + self.attention = BertParallelSelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_parallel=True, + init_method=init_method) + # Self attention output. + self.self_output = BertParallelTransformerOutput( + hidden_size, + hidden_size, + output_dropout_prob, + layernorm_epsilon=layernorm_epsilon, + input_is_parallel=True, + init_method=init_method) + # Intermediate. + self.intermediate = ColumnParallelLinear( + hidden_size, + intermediate_size, + gather_output=False, + init_method=init_method) + self.intermediate_activation_fn = intermediate_activation_fn + # Output. + self.output = BertParallelTransformerOutput( + intermediate_size, + hidden_size, + output_dropout_prob, + layernorm_epsilon=layernorm_epsilon, + input_is_parallel=True, + init_method=init_method) + + def forward(self, hidden_states, attention_mask): + # [b, s, hp] + attention_output_parallel = self.attention(hidden_states, + attention_mask) + # [b, s, h] + attention_self_output = self.self_output(attention_output_parallel, + hidden_states) + # [b, s, ip] + intermediate_output_parallel = self.intermediate(attention_self_output) + intermediate_output_parallel = self.intermediate_activation_fn( + intermediate_output_parallel) + # [b, s, h] + layer_output = self.output(intermediate_output_parallel, + attention_self_output) + + return layer_output diff --git a/modelscope/models/nlp/mglm/mpu/utils.py b/modelscope/models/nlp/mglm/mpu/utils.py new file mode 100644 index 00000000..76c37a2b --- /dev/null +++ b/modelscope/models/nlp/mglm/mpu/utils.py @@ -0,0 +1,70 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
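+#
+# Usage sketch (added comment, not executed): split_tensor_along_last_dim is
+# how the attention layers split a fused projection, e.g. a [b, s, 3h]
+# query_key_value output into three [b, s, h] chunks, while VocabUtility
+# returns the half-open [first, last) vocabulary slice owned by a rank, e.g.
+# vocab_range_from_global_vocab_size(48, rank=1, world_size=4) -> (12, 24).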
+ +import torch + + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, '{} is not divisible by {}'.format( + numerator, denominator) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + + +def split_tensor_along_last_dim(tensor, + num_partitions, + contiguous_split_chunks=False): + """Split a tensor along its last dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = divide(tensor.size()[last_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class VocabUtility: + """Split the vocabulary into `world_size` chunks amd return the + first and last index of the vocabulary belonging to the `rank` + partition: Note that indecies in [fist, last)""" + + @staticmethod + def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, + rank, world_size): + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f, index_l + + @staticmethod + def vocab_range_from_global_vocab_size(global_vocab_size, rank, + world_size): + per_partition_vocab_size = divide(global_vocab_size, world_size) + return VocabUtility.vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, world_size) diff --git a/modelscope/models/nlp/mglm/process_grid.py b/modelscope/models/nlp/mglm/process_grid.py new file mode 100644 index 00000000..d425c970 --- /dev/null +++ b/modelscope/models/nlp/mglm/process_grid.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 Zhipu.AI + +import glob +import os +import statistics +import sys + +import json + +path_pattern = sys.argv[1] +target_type = sys.argv[2] +best_value, best_result, best_name = None, None, None +mean_result = {} +print(path_pattern) +for dir_path in glob.glob(path_pattern, recursive=True): + entry = os.path.basename(dir_path) + valid_result = None + test_found = os.path.exists(os.path.join(dir_path, 'test_results.json')) + valid_path = os.path.join(dir_path, 'results.json') + if os.path.exists(valid_path): + print(entry) + with open(valid_path) as file: + valid_result = json.load(file) + else: + print(f'{entry} no validation results') + continue + if not test_found: + print(f'{entry} not tested yet') + if target_type == 'max': + metric = sys.argv[3] + metric_value = valid_result[metric] + if best_value is None or metric_value > best_value: + best_value = metric_value + best_result = valid_result + best_name = entry + elif target_type == 'mean' or target_type == 'median': + if mean_result: + for metric, value in valid_result.items(): + if metric not in ['type', 'epoch']: + mean_result[metric].append(value) + else: + mean_result = { + metric: [value] + for metric, value in valid_result.items() + if metric not in ['type', 'epoch'] + } + +if target_type == 'max': + print(f'Best result found at {best_name}: {best_result}') +elif target_type == 'mean': + mean_result = { + metric: sum(value) / len(value) + for metric, 
value in mean_result.items() + } + print(f'Mean result {mean_result}') +elif target_type == 'median': + mean_result = { + metric: statistics.median(value) + for metric, value in mean_result.items() + } + print(f'Mean result {mean_result}') diff --git a/modelscope/models/nlp/mglm/requirements.txt b/modelscope/models/nlp/mglm/requirements.txt new file mode 100644 index 00000000..e44ae5d1 --- /dev/null +++ b/modelscope/models/nlp/mglm/requirements.txt @@ -0,0 +1,22 @@ +boto3 +botocore +deepspeed +fasttext +filelock +ftfy +langdetect +lsh +matplotlib +mpi4py +nltk +pandas +regex +requests +rouge_score +scikit_learn +scipy +sentencepiece +termcolor +tldextract +tqdm +transformers diff --git a/modelscope/models/nlp/mglm/run_test.py b/modelscope/models/nlp/mglm/run_test.py new file mode 100644 index 00000000..2f568265 --- /dev/null +++ b/modelscope/models/nlp/mglm/run_test.py @@ -0,0 +1,10 @@ +# Copyright (c) 2022 Zhipu.AI + +import sys + +if sys.argv[1] == 'block': + from test.test_block import main + main() +elif sys.argv[1] == 'rel_shift': + from test.test_rel_shift import main + main() diff --git a/modelscope/models/nlp/mglm/tasks/data_utils.py b/modelscope/models/nlp/mglm/tasks/data_utils.py new file mode 100644 index 00000000..179d304e --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/data_utils.py @@ -0,0 +1,389 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tasks data utility.""" +import copy +import pickle +import re +from typing import Dict, List, Optional + +import json +import numpy as np +import torch +import torch.utils.data +from torch.utils.data.dataloader import default_collate + +from modelscope.models.nlp.mglm import mpu + + +def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace('\n', ' ') + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + +class InputExample(object): + """A raw input example consisting of one or two segments of text and a label""" + + def __init__(self, + guid, + text_a, + text_b=None, + label=None, + logits=None, + meta: Optional[Dict] = None, + idx=-1, + num_choices=1): + """ + Create a new InputExample. 
+ + :param guid: a unique textual identifier + :param text_a: the sequence of text + :param text_b: an optional, second sequence of text + :param label: an optional label + :param logits: an optional list of per-class logits + :param meta: an optional dictionary to store arbitrary meta information + :param idx: an optional numeric index + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + self.logits = logits + self.idx = idx + self.num_choices = num_choices + self.meta = meta if meta else {} + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serialize this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serialize this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n' + + @staticmethod + def load_examples(path: str) -> List['InputExample']: + """Load a set of input examples from a file""" + with open(path, 'rb') as fh: + return pickle.load(fh) + + @staticmethod + def save_examples(examples: List['InputExample'], path: str) -> None: + """Save a set of input examples to a file""" + with open(path, 'wb') as fh: + pickle.dump(examples, fh) + + +def num_special_tokens_to_add(text_a_ids, + text_b_ids, + answer_ids, + add_cls, + add_sep, + add_piece, + add_eos=True): + num_tokens = 0 + if add_cls: + num_tokens += 1 + if text_b_ids and add_sep: + num_tokens += 1 + if add_eos: + num_tokens += 1 + if not answer_ids and add_piece: + num_tokens += 1 + return num_tokens + + +def build_input_from_ids(text_a_ids, + text_b_ids, + answer_ids, + max_seq_length, + tokenizer, + args=None, + add_cls=True, + add_sep=False, + add_piece=False, + add_eos=True, + mask_id=None): + if mask_id is None: + mask_id = tokenizer.get_command('MASK').Id + eos_id = tokenizer.get_command('eos').Id + cls_id = tokenizer.get_command('ENC').Id + sep_id = tokenizer.get_command('sep').Id + ids = [] + types = [] + paddings = [] + # CLS + if add_cls: + ids.append(cls_id) + types.append(0) + paddings.append(1) + # A + len_text_a = len(text_a_ids) + ids.extend(text_a_ids) + types.extend([0] * len_text_a) + paddings.extend([1] * len_text_a) + # B + if text_b_ids is not None: + # SEP + if add_sep: + ids.append(sep_id) + types.append(0) + paddings.append(1) + len_text_b = len(text_b_ids) + ids.extend(text_b_ids) + types.extend([1] * len_text_b) + paddings.extend([1] * len_text_b) + eos_length = 1 if add_eos else 0 + # Cap the size. 
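+    # (Keep at most max_seq_length - 1 tokens here so the EOS token appended
+    # below, when add_eos is set, still fits within max_seq_length.)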
+ if len(ids) >= max_seq_length - eos_length: + max_seq_length_m1 = max_seq_length - 1 + ids = ids[0:max_seq_length_m1] + types = types[0:max_seq_length_m1] + paddings = paddings[0:max_seq_length_m1] + end_type = 0 if text_b_ids is None else 1 + if add_eos: + ids.append(eos_id) + types.append(end_type) + paddings.append(1) + sep = len(ids) + target_ids = [0] * len(ids) + loss_masks = [0] * len(ids) + position_ids = list(range(len(ids))) + block_position_ids = [0] * len(ids) + # Piece + if add_piece or answer_ids is not None: + sop_id = tokenizer.get_command('sop').Id + mask_position = ids.index( + mask_id + ) if not args.sentinel_token else args.max_position_embeddings + ids.append(sop_id) + types.append(end_type) + paddings.append(1) + position_ids.append(mask_position) + block_position_ids.append(1) + if answer_ids is not None: + len_answer = len(answer_ids) + ids.extend(answer_ids[:-1]) + types.extend([end_type] * (len_answer - 1)) + paddings.extend([1] * (len_answer - 1)) + position_ids.extend([mask_position] * (len_answer - 1)) + if not args.no_block_position: + block_position_ids.extend(range(2, len(answer_ids) + 1)) + else: + block_position_ids.extend([1] * (len(answer_ids) - 1)) + target_ids.extend(answer_ids) + loss_masks.extend([1] * len(answer_ids)) + else: + target_ids.append(0) + loss_masks.append(1) + # Padding. + padding_length = max_seq_length - len(ids) + if padding_length > 0: + ids.extend([eos_id] * padding_length) + types.extend([eos_id] * padding_length) + paddings.extend([0] * padding_length) + position_ids.extend([0] * padding_length) + block_position_ids.extend([0] * padding_length) + target_ids.extend([0] * padding_length) + loss_masks.extend([0] * padding_length) + if not args.masked_lm: + position_ids = [position_ids, block_position_ids] + return ids, types, paddings, position_ids, sep, target_ids, loss_masks + + +def build_decoder_input(enc_ids, answer_ids, max_seq_length, + max_dec_seq_length, tokenizer): + mask_id = tokenizer.get_command('MASK').Id + eos_id = tokenizer.get_command('eos').Id + sop_id = tokenizer.get_command('sop').Id + enc_len = len(enc_ids) # noqa + masks = [] + # TODO: it probably takes too much memory + # for i in range(max_dec_seq_length): + # m = [1]*enc_len + [0]*(max_seq_length - enc_len) + [1]*(i+1) + [0]*(max_dec_seq_length-1-i) + # masks.append(m) + mask_position = enc_ids.index(mask_id) + len_answer = len(answer_ids) + ids = [sop_id] + answer_ids[:-1] + types = [0] * len_answer # not used + paddings = [1] * len_answer + position_ids = [mask_position] * len_answer + block_position_ids = list(range(1, len_answer + 1)) + target_ids = answer_ids + loss_masks = [1] * len_answer + # Padding. 
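+    # (All fields are padded up to max_dec_seq_length: padded token slots use
+    # the EOS id, the remaining fields use zeros, and the loss mask is zero so
+    # padding never contributes to the loss.)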
+ padding_length = max_dec_seq_length - len(ids) + if padding_length > 0: + ids.extend([eos_id] * padding_length) + types.extend([0] * padding_length) + paddings.extend([0] * padding_length) + position_ids.extend([0] * padding_length) + block_position_ids.extend([0] * padding_length) + target_ids.extend([0] * padding_length) + loss_masks.extend([0] * padding_length) + position_ids = [position_ids, block_position_ids] + return ids, types, paddings, position_ids, masks, target_ids, loss_masks + + +def build_sample(ids, + types=None, + paddings=None, + positions=None, + masks=None, + label=None, + unique_id=None, + target=None, + logit_mask=None, + segment_ids=None, + prompt_ids=None): + """Convert to numpy and return a sample consumed by the batch producer.""" + + ids_np = np.array(ids, dtype=np.int64) + sample = {'text': ids_np, 'label': int(label)} + if types is not None: + types_np = np.array(types, dtype=np.int64) + sample['types'] = types_np + if paddings is not None: + paddings_np = np.array(paddings, dtype=np.int64) + sample['padding_mask'] = paddings_np + if positions is not None: + positions_np = np.array(positions, dtype=np.int64) + sample['position'] = positions_np + if masks is not None: + masks_np = np.array(masks, dtype=np.int64) + sample['mask'] = masks_np + if target is not None: + target_np = np.array(target, dtype=np.int64) + sample['target'] = target_np + if logit_mask is not None: + logit_mask_np = np.array(logit_mask, dtype=np.int64) + sample['logit_mask'] = logit_mask_np + if segment_ids is not None: + segment_ids = np.array(segment_ids, dtype=np.int64) + sample['segment_id'] = segment_ids + if prompt_ids is not None: + prompt_ids = np.array(prompt_ids, dtype=np.int64) + sample['prompt_pos'] = prompt_ids + if unique_id is not None: + sample['uid'] = unique_id + return sample + + +def build_decoder_sample(sample, dec_ids, dec_position, dec_masks, dec_target, + dec_logit_mask): + sample['dec_text'] = np.array(dec_ids) + sample['dec_position'] = np.array(dec_position) + sample['dec_mask'] = np.array(dec_masks) + sample['dec_target'] = np.array(dec_target) + sample['dec_logit_mask'] = np.array(dec_logit_mask) + return sample + + +def my_collate(batch): + new_batch = [{key: value + for key, value in sample.items() if key != 'uid'} + for sample in batch] + text_list = [sample['text'] for sample in batch] + + def pad_choice_dim(data, choice_num): + if len(data) < choice_num: + data = np.concatenate([data] + + [data[0:1]] * (choice_num - len(data))) + return data + + if len(text_list[0].shape) == 2: + choice_nums = list(map(len, text_list)) + max_choice_num = max(choice_nums) + for i, sample in enumerate(new_batch): + for key, value in sample.items(): + if key != 'label': + sample[key] = pad_choice_dim(value, max_choice_num) + else: + sample[key] = value + sample['loss_mask'] = np.array( + [1] * choice_nums[i] + [0] * (max_choice_num - choice_nums[i]), + dtype=np.int64) + + if 'dec_text' in new_batch[0]: + choice_nums = [len(sample['dec_text']) for sample in new_batch] + if choice_nums.count(choice_nums[0]) != len(choice_nums): + max_choice_num = max(choice_nums) + for i, sample in enumerate(new_batch): + for key, value in sample.items(): + if key.startswith('dec_'): + sample[key] = pad_choice_dim(value, max_choice_num) + sample['loss_mask'] = np.array( + [1] * choice_nums[i] + [0] * # noqa + (max_choice_num - choice_nums[i]), + dtype=np.int64) + + new_batch = default_collate(new_batch) + if 'uid' in batch[0]: + uid_list = [sample['uid'] for sample in batch] + new_batch['uid'] 
= uid_list + return new_batch + + +class FakeDataloader: + + def __init__(self, num_iters): + self.num_iters = num_iters + + def __iter__(self): + if self.num_iters is not None: + for _ in range(self.num_iters): + yield None + else: + while True: + yield None + + +def build_data_loader(dataset, + batch_size, + num_workers, + drop_last, + shuffle=True, + only_rank0=False): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + if only_rank0: + rank, world_size = 0, 1 + else: + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle) + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=my_collate) + + return data_loader diff --git a/modelscope/models/nlp/mglm/tasks/eval_utils.py b/modelscope/models/nlp/mglm/tasks/eval_utils.py new file mode 100644 index 00000000..da23a884 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/eval_utils.py @@ -0,0 +1,249 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation utilities.""" + +import datetime +import os +import random +import time +from collections import OrderedDict +from typing import List + +import mpu +import torch +from finetune_glm import process_batch +from sklearn.metrics import f1_score +from tasks.data_utils import InputExample, build_data_loader +from utils import debug_finetune_data, get_spare_port, print_rank_0 + + +def accuracy_metric(predictions, labels, examples): + count = 0 + num_predictions = max(len(predictions), 1) + assert len(predictions) == len(labels) + for prediction, label in zip(predictions, labels): + count += prediction == label + return count * 100.0 / num_predictions + + +def f1_metric(predictions, labels, examples): + return f1_score(labels, predictions) + + +def f1_macro_metric(predictions, labels, examples): + return f1_score(labels, predictions, average='macro') + + +global_tokenizer = None + + +def accuracy_func_provider(single_dataset_provider, + metric_dict, + args, + is_test=False, + eval_func=None, + output_func=None, + only_rank0=True, + tokenizer=None): + """Provide function that calculates accuracies.""" + # Build dataloaders. 
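+    # One dataloader is built per evaluation split below, using the evaluation
+    # batch size (falling back to the training batch size) with shuffling and
+    # drop_last disabled.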
+ global global_tokenizer + global_tokenizer = tokenizer + if only_rank0 and torch.distributed.is_initialized( + ) and torch.distributed.get_rank() != 0: + return None + if is_test and not args.eval_valid: + datapaths = args.test_data if args.test_data is not None else ['test'] + else: + datapaths = args.valid_data if args.valid_data is not None else ['dev'] + if eval_func is None: + eval_func = multichoice_evaluate + dataloaders = [] + eval_batch_size = args.eval_batch_size if args.eval_batch_size else args.batch_size + for datapath in datapaths: + dataset = single_dataset_provider(datapath) + dataloader = build_data_loader( + dataset, + eval_batch_size, + num_workers=args.num_workers, + drop_last=False, + shuffle=False, + only_rank0=only_rank0) + dataloaders.append((dataset.dataset_name, dataloader)) + + def metrics_func(model, + epoch, + output_predictions=False, + summary_writer=None): + print_rank_0('calculating metrics ...') + score_dict = OrderedDict([(key, 0.0) for key in metric_dict + ]) if isinstance(metric_dict, dict) else { + metric_dict: 0.0 + } # noqa + total = 0 + for name, dataloader in dataloaders: + example_dict = None + if hasattr(dataloader.dataset, 'examples'): + example_dict = dataloader.dataset.examples + start_time = time.time() + predictions, labels, examples = eval_func(model, dataloader, + example_dict, args) + elapsed_time = time.time() - start_time + if output_predictions and torch.distributed.get_rank() == 0: + filename = os.path.join(args.log_dir, name + '.jsonl') + output_func(predictions, examples, filename) + total_count = len(predictions) + single_dict = { + key: metric(predictions, labels, examples) + for key, metric in metric_dict.items() + } + output_str = ' > |epoch: {}| metrics for {}: total {}'.format( + epoch, name, total_count) + for key, value in single_dict.items(): + output_str += ' {} = {:.4f} %'.format(key, value) + if summary_writer is not None and epoch >= 0 and not is_test and len( + dataloaders) > 1: + summary_writer.add_scalar(f'Train/valid_{name}_{key}', + value, epoch) + output_str += ' elapsed time (sec): {:.3f}'.format(elapsed_time) + if len(dataloaders) > 1: + print_rank_0(output_str) + for key in score_dict: + score_dict[key] += single_dict[key] * total_count + total += total_count + score_dict = { + key: score / float(total) + for key, score in score_dict.items() + } + output_str = ' >> |epoch: {}| overall: total = {}'.format(epoch, total) + for key, score in score_dict.items(): + output_str += ' {} = {:.4f}'.format(key, score) + if summary_writer is not None and epoch >= 0 and not is_test: + summary_writer.add_scalar(f'Train/valid_{key}', score, epoch) + print_rank_0(output_str) + return score_dict + + return metrics_func + + +segment_length = 10 + + +def multichoice_evaluate(model, dataloader, example_dict, args): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + model.eval() + port = get_spare_port(args) + print_rank_0(f'Using port {port}') + store = torch.distributed.TCPStore(args.master_ip, port, + torch.distributed.get_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + # file_path = os.path.join("/cache", args.experiment_name + "_store") + # print_rank_0(f"Using file store at {file_path}") + # store = torch.distributed.FileStore(file_path, torch.distributed.get_world_size()) + with torch.no_grad(): + # For all the batches in the dataset. + for _, batch in enumerate(dataloader): + # Run the model forward. 
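+            # How the batch is unpacked depends on the model variant: plain
+            # BERT consumes (tokens, types, padding mask); cloze-style
+            # evaluation also needs target ids and a logit mask (plus prompt
+            # positions for continuous prompts, or separate decoder inputs
+            # when fast decoding); otherwise only tokens, position ids and
+            # the attention mask are used.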
+ data = process_batch(batch, args) + if args.pretrained_bert: + tokens, types, labels_, attention_mask = data['text'], data[ + 'types'], data['label'], data['padding_mask'] + inputs = [tokens, types, attention_mask] + elif args.cloze_eval: + tokens, labels_, position_ids = data['text'], data[ + 'label'], data['position'] + attention_mask, target_ids, logit_mask = data['mask'], data[ + 'target'], data['logit_mask'] + if not args.fast_decode: + inputs = [ + tokens, position_ids, attention_mask, target_ids, + logit_mask + ] + if args.continuous_prompt: + prompt_pos = data['prompt_pos'] + inputs.append(prompt_pos) + else: + dec_input_ids, dec_position_ids, dec_attention_mask = data[ + 'dec_text'], data['dec_position'], data['dec_mask'] + dec_target_ids, dec_logit_mask = data['dec_target'], data[ + 'dec_logit_mask'] + inputs = [ + tokens, position_ids, attention_mask, dec_input_ids, + dec_position_ids, dec_attention_mask, dec_target_ids, + dec_logit_mask + ] + else: + tokens, labels_, position_ids, attention_mask = data[ + 'text'], data['label'], data['position'], data['mask'] + inputs = [tokens, position_ids, attention_mask] + if len(inputs[0].shape + ) == 3 and inputs[0].size(1) > segment_length: + logit_list = [] + for i in range((inputs[0].size(1) - 1) // segment_length + 1): + input_batch = [ + arg[:, i * segment_length:(i + 1) * segment_length] + for arg in inputs + ] + if args.pretrained_bert: + logits = model(*input_batch) + else: + logits, *mems = model(*input_batch) + logit_list.append(logits) + logits = torch.cat(logit_list, dim=1) + elif args.cloze_eval and args.fast_decode: + logit_list = [] + num_choices = inputs[3].size(1) + for i in range((num_choices - 1) // segment_length + 1): + input_batch = inputs[:3] + [ + arg[:, i * segment_length:(i + 1) * segment_length] + for arg in inputs[3:] + ] + logits, *mems = model(*input_batch) + logit_list.append(logits) + logits = torch.cat(logit_list, dim=1) + else: + if args.pretrained_bert: + logits = model(*inputs) + else: + logits, *mems = model(*inputs) + if 'segment_id' in data: + from torch_scatter import scatter_sum + if 'loss_mask' in data: + logits = logits * data['loss_mask'] + logits = scatter_sum(logits, data['segment_id'], dim=1) + elif 'loss_mask' in data: + loss_mask = data['loss_mask'] + logits = logits * loss_mask - 10000.0 * (1.0 - loss_mask) + uid_list = batch['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + predicted = torch.argmax(logits, dim=-1).tolist() + labels = labels_.tolist() + if args.task.lower() == 'wsc': + predicted = [1 if pred == 0 else 0 for pred in predicted] + if mpu.get_model_parallel_rank() == 0: + for uid, prediction, label in zip(uid_list, predicted, labels): + store.set(uid, str((prediction, label))) + model.train() + torch.distributed.barrier() + predictions, labels, examples = [], [], [] + for uid, example in example_dict.items(): + prediction, label = eval(store.get(uid)) + predictions.append(prediction) + labels.append(label) + examples.append(example) + torch.distributed.barrier() + return predictions, labels, examples diff --git a/modelscope/models/nlp/mglm/tasks/language_model/dataset.py b/modelscope/models/nlp/mglm/tasks/language_model/dataset.py new file mode 100644 index 00000000..cfdfa714 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/dataset.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 Zhipu.AI + +import math +from bisect import bisect_right +from itertools import accumulate + +import json +import numpy as np +import torch 
+from tasks.data_utils import build_input_from_ids, num_special_tokens_to_add +from tasks.language_model.detokenizer import get_detokenizer +from utils import print_rank_0 + + +class LMDataset(torch.utils.data.Dataset): + + def __init__(self, args, documents, tokenizer, num_original_tokens, + num_tokenized_tokens): + self.args = args + self.documents = documents + self.max_seq_len = args.seq_length - 1 + self.tokenizer = tokenizer + self.overalapping_eval = args.overlapping_eval + if self.overalapping_eval is None: + self.overalapping_eval = self.max_seq_len + self.overalapping_eval = max(1, self.overalapping_eval) + self.num_original_tokens = num_original_tokens + self.num_tokenized_tokens = num_tokenized_tokens + # remove first sequence tokens + targets = [ + max(len(tokens) - self.max_seq_len, 0) for tokens in self.documents + ] + self.num_sequences = [ + max(math.ceil(target / self.overalapping_eval) + 1, 1) + for target in targets + ] + self.weights = list(accumulate(self.num_sequences)) + self.left_weights = [0] + self.weights[:-1] + self.unidirectional = args.unidirectional + self.block_lm = args.block_lm + mask_token = 'gMASK' if args.task_mask else 'MASK' + self.mask_id = self.tokenizer.get_command(mask_token).Id + + def __len__(self): + return sum(self.num_sequences) + + def __getitem__(self, idx): + document_idx = bisect_right(self.weights, idx) + idx = idx - self.left_weights[document_idx] + start_idx = idx * self.overalapping_eval + end_idx = start_idx + self.max_seq_len + tokens = self.documents[document_idx][start_idx:end_idx] + if self.block_lm: + if idx == 0 or self.unidirectional: + prompt, text = tokens[:1], tokens[1:] + else: + prompt_length = self.max_seq_len - self.overalapping_eval + prompt, text = tokens[:prompt_length], tokens[prompt_length:] + prompt = prompt + [self.mask_id] + num_special_tokens = num_special_tokens_to_add( + prompt, + None, + text, + add_cls=True, + add_sep=False, + add_piece=True, + add_eos=False) + data = build_input_from_ids( + prompt, + None, + text, + self.max_seq_len + num_special_tokens + 1, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + add_eos=False, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if idx != 0 and self.unidirectional: + loss_masks = np.array(loss_masks, dtype=np.int64) + loss_masks[:-self.overalapping_eval] = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64) + } + else: + loss_masks = [1] * len(tokens) + if len(tokens) < self.max_seq_len: + tokens = tokens + [0] * (self.max_seq_len - len(tokens)) + loss_masks = loss_masks + [0] * ( + self.max_seq_len - len(loss_masks)) + if idx != 0: + loss_masks = np.array(loss_masks, dtype=np.int64) + loss_masks[:-self.overalapping_eval] = 0 + return { + 'text': np.array(tokens, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64) + } + + +class LambadaDataset(torch.utils.data.Dataset): + + def __init__(self, args, tokenizer, strict=True): + data_path = args.valid_data[0] + print_rank_0( + '> building lambada dataset from {} ...'.format(data_path)) + self.args = args + self.max_seq_length = args.seq_length + self.tokenizer = tokenizer + self.pad_idx = tokenizer.get_command('pad').Id + self.strict = strict + self.block_lm = args.block_lm + self.unidirectional = 
args.unidirectional + mask_token = 'gMASK' if args.task_mask else 'MASK' + self.mask_id = self.tokenizer.get_command(mask_token).Id + + self.tokens = [] + self.labels = [] + with open(data_path, 'r') as f: + for line in f.readlines(): + text = json.loads(line)['text'] + tokens, labels = self.get_tokens(text) + self.tokens.append(tokens) + self.labels.append(labels) + + def get_tokens(self, text): + if not self.strict: + tokens = self.tokenizer.EncodeAsIds(text).tokenization + return tokens[:-1], [tokens[-1]] + last_token = text.split()[-1] + start_idx = text.rfind(last_token) + beginning_tokens = self.tokenizer.EncodeAsIds( + text[:start_idx].strip()).tokenization + last_token = self.tokenizer.EncodeAsIds(' ' + last_token).tokenization + return beginning_tokens, last_token + + def __len__(self): + return len(self.tokens) + + def __getitem__(self, idx): + tokens, answer = self.tokens[idx], self.labels[idx] + if self.block_lm: + if self.unidirectional: + tokens, answer_tokens = tokens[:1], tokens[1:] + answer + else: + answer_tokens = answer + tokens = tokens + [self.mask_id] + num_special_tokens = num_special_tokens_to_add( + tokens, + None, + answer_tokens, + add_cls=True, + add_sep=False, + add_piece=True) + left_shift = len(tokens) + len( + answer_tokens) + num_special_tokens - self.max_seq_length + if left_shift > 0: + tokens = tokens[left_shift:] + data = build_input_from_ids( + tokens, + None, + answer_tokens, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if self.unidirectional: + loss_masks = np.array(loss_masks, dtype=np.int64) + last_index = len(loss_masks) + while loss_masks[last_index - 1] == 0: + last_index -= 1 + loss_masks[:last_index - len(answer)] = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64) + } + else: + left_shift = len(tokens) - self.max_seq_length + if left_shift > 0: + tokens = tokens[left_shift:] + ids = tokens + answer + if len(ids) < self.max_seq_length: + ids = ids + [0] * (self.max_seq_length - len(ids)) + loss_masks = [0] * len(tokens) + [1] * len(answer) + if len(loss_masks) < self.max_seq_length: + loss_masks = loss_masks + [0] * ( + self.max_seq_length - len(loss_masks)) + return { + 'text': np.array(ids, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64) + } + + +def build_lambada_dataset(tokenizer, args): + """Build lambada dataset.""" + assert len(args.valid_data) == 1 + val_dataset = LambadaDataset(args, tokenizer, strict=True) + print_rank_0(' > found {} samples, {} label tokens.'.format( + len(val_dataset), sum(map(len, val_dataset.labels)))) + return val_dataset + + +def build_lm_dataset(tokenizer, args): + documents = [] + num_tokens, num_original_tokens = 0, 0 + with open(args.valid_data[0], encoding='utf-8') as file: + for line in file: + tokens = tokenizer.EncodeAsIds(line.strip()).tokenization + num_tokens += len(tokens) + num_original_tokens += len(line.strip().split(' ')) + documents.append(tokens) + val_dataset = LMDataset(args, documents, tokenizer, num_original_tokens, + num_tokens) + print_rank_0( + ' > number of document: {}, number of original tokens {}, number of detokenized tokens: {}' + .format(len(documents), num_original_tokens, num_tokens)) 
+ return val_dataset + + +def build_wikitext103_dataset(tokenizer, args): + """""" + + assert len(args.valid_data) == 1 + with open(args.valid_data[0], 'rb') as reader: + entire_data = reader.read().decode('utf-8') + num_original_tokens = len(entire_data.strip().split(' ')) + entire_data = get_detokenizer('wikitext')(entire_data) + print_rank_0(entire_data[:1024]) + tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization + num_tokenized_tokens = len(tokenized_data) + + val_dataset = LMDataset(args, [tokenized_data], tokenizer, + num_original_tokens, num_tokenized_tokens) + print_rank_0(' > number of original tokens: {}, number of detokenized ' + 'tokens: {}'.format(num_original_tokens, + num_tokenized_tokens)) + return val_dataset diff --git a/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py b/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py new file mode 100755 index 00000000..dc1524de --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 Zhipu.AI + +import re + + +def ptb_detokenizer(string): + string = string.replace(" '", "'") + string = string.replace(' \n', '\n') + string = string.replace('\n ', '\n') + string = string.replace(" n't", "n't") + string = string.replace(' N ', '1 ') + string = string.replace('$ 1', '$1') + string = string.replace('# 1', '#1') + return string + + +def wikitext_detokenizer(string): + # contractions + string = string.replace("s '", "s'") + string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) + # number separators + string = string.replace(' @-@ ', '-') + string = string.replace(' @,@ ', ',') + string = string.replace(' @.@ ', '.') + # punctuation + string = string.replace(' : ', ': ') + string = string.replace(' ; ', '; ') + string = string.replace(' . ', '. ') + string = string.replace(' ! ', '! ') + string = string.replace(' ? ', '? ') + string = string.replace(' , ', ', ') + # double brackets + string = re.sub(r'\(\s*([^\)]*?)\s*\)', r'(\1)', string) + string = re.sub(r'\[\s*([^\]]*?)\s*\]', r'[\1]', string) + string = re.sub(r'{\s*([^}]*?)\s*}', r'{\1}', string) + string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) + string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) + # miscellaneous + string = string.replace('= = = =', '====') + string = string.replace('= = =', '===') + string = string.replace('= =', '==') + string = string.replace(' ' + chr(176) + ' ', chr(176)) + string = string.replace(' \n', '\n') + string = string.replace('\n ', '\n') + string = string.replace(' N ', ' 1 ') + string = string.replace(" 's", "'s") + + return string + + +def lambada_detokenizer(string): + return string + + +def get_detokenizer(dataset): + return DETOKENIZERS[dataset] + + +DETOKENIZERS = { + 'ptb': ptb_detokenizer, + 'wikitext': wikitext_detokenizer, + 'lambada': lambada_detokenizer, +} diff --git a/modelscope/models/nlp/mglm/tasks/language_model/finetune.py b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py new file mode 100644 index 00000000..b6089e6f --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py @@ -0,0 +1,254 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPT2 zero-shot evaluation.""" + +import functools +import math + +import mpu +import torch +from finetune_glm import finetune +from pretrain_glm import get_batch +from tasks.data_utils import build_data_loader +from tasks.language_model.dataset import (build_lambada_dataset, + build_lm_dataset, + build_wikitext103_dataset) +from utils import print_rank_0 + +global_tokenizer = None + + +def lm_forward_step(data, model, args, timers, mems, eval_metric=None): + """Forward step.""" + + # Get the batch. + if timers is not None: + timers('batch generator').start() + if 'mask' in data: + data['attention_mask'] = data.pop('mask') + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data, args) + if timers is not None: + timers('batch generator').stop() + + def print_masked_text(batch_id): + block_position_ids = position_ids[:, 1] + position_ids_ = position_ids[:, 0] + output_tokens = [] + sep = attention_mask[batch_id].item() + for i, token in enumerate(tokens[batch_id, :sep].tolist()): + if global_tokenizer is not None: + token = global_tokenizer.IdToToken(token) + if token.startswith('[MASK'): + token = f'[{position_ids_[batch_id, i].item()}, {token}]' + if token.startswith('##') and len( + output_tokens) > 0 and not output_tokens[-1].endswith( + ']'): + output_tokens[-1] += token[2:] + else: + output_tokens.append(token) + else: + output_tokens.append(str(token)) + print(' '.join(output_tokens)) + last_index = None + for i in range(sep, tokens.size(1)): + if global_tokenizer.IdToToken( + tokens[batch_id, i].item()).startswith('<|startofpiece'): + if last_index is not None: + print( + global_tokenizer.DecodeIds( + tokens[batch_id, last_index:i].tolist()), '|', + global_tokenizer.DecodeIds( + labels[batch_id, last_index:i].tolist())), + print(position_ids_[batch_id, last_index:i].tolist(), + block_position_ids[batch_id, last_index:i].tolist()) + last_index = i + if last_index is not None: + print( + global_tokenizer.DecodeIds(tokens[batch_id, + last_index:].tolist()), '|', + global_tokenizer.DecodeIds(labels[batch_id, + last_index:].tolist())) + print(position_ids_[batch_id, last_index:].tolist(), + block_position_ids[batch_id, last_index:].tolist()) + + # Forward model. 
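+    # (With continuous prompts the prompt positions are passed to the model as
+    # an extra keyword argument; otherwise the model only receives the tokens,
+    # position ids, attention mask and any cached memories.)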
+ if args.continuous_prompt: + prompt_pos = data['prompt_pos'].long().cuda() + logits, *mems = model( + tokens, position_ids, attention_mask, *mems, prompt_pos=prompt_pos) + else: + logits, *mems = model(tokens, position_ids, attention_mask, *mems) + + if eval_metric is None or eval_metric == 'loss': + losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), + labels) + loss_mask = loss_mask.view(-1) + # The loss is not normalized for fair comparison + loss = torch.sum(losses.view(-1) * loss_mask) + if eval_metric is None: + loss = loss / loss_mask.sum() + return loss, mems, 'bert' + elif eval_metric == 'accuracy' or eval_metric == 'classify': + logits = mpu.gather_from_model_parallel_region(logits) + outputs = torch.argmax(logits, -1) + correct = (outputs == labels).float() + correct[(1 - loss_mask).bool()] = 1 + correct = correct.prod(-1) + if eval_metric == 'accuracy': + correct = correct.sum() + return correct, mems, 'bert' + else: + raise NotImplementedError( + 'Metric {} not implemented'.format(eval_metric)) + + +def classify_evaluate(model, dataloader, example_dict, args): + """Evaluation.""" + # Turn on evaluation mode which disables dropout. + model.eval() + predictions, labels, examples = [], [], [] + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(dataloader): + # Forward evaluation. + output, _, _ = lm_forward_step( + batch, model, args, None, [], eval_metric='classify') + uid_list = batch['uid'] + example_batch = [example_dict[uid] for uid in uid_list] + predictions.extend(output.long().tolist()) + label = batch['label'].tolist() + labels.extend(label) + examples.extend(example_batch) + return predictions, labels, examples + + +def evaluate(model, dataloader, eval_metric, args): + """Evaluation.""" + # Turn on evaluation mode which disables dropout. + model.eval() + total_output, total_count = 0.0, 0 + total_tokens = 0 + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(dataloader): + if (iteration + 1) % args.log_interval == 0: + print_rank_0('> working on iteration: {}'.format(iteration)) + # Forward evaluation. + output, _, _ = lm_forward_step( + batch, model, args, None, [], eval_metric=eval_metric) + count = batch['text'].size(0) + count = torch.cuda.LongTensor([count]) + # Reduce across processes. + torch.distributed.all_reduce( + output, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce( + count, group=mpu.get_data_parallel_group()) + + total_output += output.item() + total_count += count.item() + total_tokens += batch['loss_mask'].sum().item() + totals = torch.cuda.FloatTensor([total_output, total_tokens]) + torch.distributed.all_reduce(totals, group=mpu.get_data_parallel_group()) + total_output, total_tokens = totals.tolist() + print(total_tokens) + return {eval_metric: total_output}, total_count + + +def evaluate_and_print_results(data_loader, model, eval_metric, args): + """Evaluate and print results on screen.""" + + # Evaluate and get results. 
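+    # (For the 'loss' metric, perplexity is exp of the per-token loss and the
+    # adjusted perplexity rescales that loss by the ratio of tokenized to
+    # original tokens before exponentiating; both exponents are clamped at 20.)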
+ output, _ = evaluate(model, data_loader, eval_metric, args) + + string = '' + if eval_metric == 'loss': + output = output['loss'] + num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens + num_original_tokens = data_loader.dataset.num_original_tokens + val_loss = output / (num_tokenized_tokens - 1) + ppl = math.exp(min(20, val_loss)) + token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1) + adjusted_ppl = math.exp(min(20, val_loss * token_ratio)) + string += 'avg loss: {:.4E} | '.format(val_loss) + string += 'ppl: {:.4E} | '.format(ppl) + string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) + string += 'token ratio: {} |'.format(token_ratio) + score_dict = { + 'avg loss': val_loss, + 'ppl': ppl, + 'adjusted ppl': adjusted_ppl + } + + elif eval_metric == 'accuracy': + output = output['accuracy'] + num_examples = len(data_loader.dataset) + acc = output / num_examples * 100 + string += 'number correct: {} | '.format(output) + string += 'total examples: {} | '.format(num_examples) + string += 'avg accuracy: {:.2f}'.format(acc) + score_dict = {'accuracy': acc} + else: + raise NotImplementedError('evaluation method for {} metric is not ' + 'implemented yet.'.format(eval_metric)) + + length = len(string) + 1 + print_rank_0('-' * length) + print_rank_0(string) + print_rank_0('-' * length) + return score_dict + + +def metrics_func_provider(args, tokenizer, is_test): + """Privde metrics callback function.""" + + if args.task.lower() == 'lambda': + eval_metric = 'accuracy' + dataset = build_lambada_dataset(tokenizer, args) + elif args.task == 'wikitext': + eval_metric = 'loss' + dataset = build_wikitext103_dataset(tokenizer, args) + elif args.task == 'language_model': + eval_metric = 'loss' + dataset = build_lm_dataset(tokenizer, args) + else: + raise NotImplementedError('{} task is not implemented.'.format( + args.task)) + # Data stuff + dataloader = build_data_loader( + dataset, + args.eval_batch_size, + args.num_workers, + drop_last=False, + shuffle=False) + + def metrics_func(model, + epoch, + output_predictions=False, + summary_writer=None): + return evaluate_and_print_results( + dataloader, model, eval_metric=eval_metric, args=args) + + global global_tokenizer + global_tokenizer = tokenizer + return metrics_func + + +def main(args): + """Main program.""" + finetune( + args, + None, {}, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=lm_forward_step) diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py b/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py new file mode 100644 index 00000000..6a4e275f --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py @@ -0,0 +1,667 @@ +# Copyright (c) 2022 Zhipu.AI + +import os +import random + +import json +import numpy as np +import torch +import torch.utils.data +from data_utils.corpora import punctuation_standardization +from tasks.data_utils import InputExample +from tqdm import tqdm +from utils import print_rank_0 + + +def gigaword_detokenize(string, is_target=False): + _tok_dict = { + '(': '-lrb-', + ')': '-rrb-', + '[': '-lsb-', + ']': '-rsb-', + '{': '-lcb-', + '}': '-rcb-', + '&': '&', + '<': '<', + '>': '>' + } + string = string.replace('UNK', '[UNK]') + string = string.replace('', '[UNK]') + for key, value in _tok_dict.items(): + string = string.replace(value, key) + # string = string.replace("''", "\"") + # string = string.replace("``", "\"") + # string = string.replace("`", "'") + # string = string.replace(" n't", "n't") + # string = string.replace(" 's", "'s") + 
# string = string.replace(" 'd", "'d") + # string = string.replace(" 'll", "'ll") + return string + + +def cnndm_detokenize(string, is_target=False): + _tok_dict = { + '(': '-LRB-', + ')': '-RRB-', + '[': '-LSB-', + ']': '-RSB-', + '{': '-LCB-', + '}': '-RCB-' + } + if not is_target: + string = string.replace('', '') + else: + string = string.replace('', '[SEP]') + for key, value in _tok_dict.items(): + string = string.replace(value, key) + string = string.replace("''", "\"") + string = string.replace('``', "\"") + string = string.replace('`', "'") + string = string.replace(" n't", "n't") + string = string.replace(" 's", "'s") + string = string.replace(" 'd", "'d") + string = string.replace(" 'll", "'ll") + return string + + +def blanklm_detokenize(string, is_target=False): + string = string.replace('_UNK', '[UNK]') + string = string.replace('', '[MASK]') + return string + + +class SummmaryProcessor: + + def __init__(self, task, data_dir, tokenizer): + self.task = task + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'val' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0( + f'Creating {self.task}-{split} dataset from {self.data_dir}') + if self.task == 'gigaword': + detokenizer = gigaword_detokenize + elif self.task == 'cnn_dm': + detokenizer = cnndm_detokenize + else: + detokenizer = None + source_texts, target_texts = [], [] + with open( + os.path.join(self.data_dir, f'{filename}.source'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = punctuation_standardization(line) + line = detokenizer(line) if detokenizer else line + source_texts.append(line) + with open( + os.path.join(self.data_dir, f'{filename}.target'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = punctuation_standardization(line) + line = detokenizer( + line, is_target=True) if detokenizer else line + target_texts.append(line) + assert len(source_texts) == len(target_texts) + example_list = [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = { + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds(target_text).tokenization) + } + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + if idx < 10: + print_rank_0( + (source_text.encode('utf-8'), target_text.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + return example_list + + +class SQuADProcessor: + + def __init__(self, data_dir, tokenizer): + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + filename = 'train.json' + elif split == 'dev': + filename = 'dev.json' + elif split == 'test': + filename = 'test.json' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating SQuAD-{split} dataset from {self.data_dir}') + example_list = [] + idx = 0 + with open( + os.path.join(self.data_dir, filename), + encoding='utf-8') as file: + dataset = json.load(file) + for paragraphs in dataset: + for paragraph in paragraphs['paragraphs']: + context = paragraph['context'] + for qa in paragraph['qas']: + question = qa['question'] + answers = {answer['text'] for answer in qa['answers']} + answer_starts = { + answer['text']: answer['answer_start'] + 
for answer in qa['answers'] + } + for answer in answers: + guid = '%s-%s' % (split, idx) + meta = { + 'answer_start': + answer_starts[answer], + 'answer': + answer, + 'question': + question, + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds( + question).tokenization) + } + example = InputExample( + guid=guid, text_a=context, meta=meta) + if idx < 10: + print_rank_0((context.encode('utf-8'), + answer.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + idx += 1 + print_rank_0(f'Creating {len(example_list)} examples for {split}') + return example_list + + +class XSumProcessor: + + def __init__(self, data_dir, tokenizer): + self.data_dir = data_dir + self.tokenizer = tokenizer + + def create_examples(self, split): + if split == 'train': + key = 'train' + elif split == 'dev': + key = 'validation' + elif split == 'test': + key = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating XSUM-{split} dataset from {self.data_dir}') + with open( + os.path.join( + self.data_dir, + 'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json')) as file: + id_list = json.load(file) + id_list = id_list[key] + source_texts, target_texts = [], [] + for i, idx in enumerate(id_list): + with open(os.path.join(self.data_dir, f'{idx}.summary')) as file: + key, sentences = None, [] + source_text, target_text = None, None + for line in file: + line = line.strip() + if line.startswith('[SN]'): + if key is not None: + if key == 'RESTBODY': + source_text = ' '.join(sentences) + elif key == 'FIRST-SENTENCE': + target_text = ' '.join(sentences) + key = line[4:-4] + sentences = [] + elif line: + sentences.append(line) + if key is not None: + if key == 'RESTBODY': + source_text = ' '.join(sentences) + elif key == 'FIRST-SENTENCE': + target_text = ' '.join(sentences) + source_texts.append(source_text) + target_texts.append(target_text) + if (i + 1) % 1000 == 0: + print_rank_0(f'Complete {i + 1} examples') + assert len(source_texts) == len(target_texts) + example_list = [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = { + 'ref': + self.tokenizer.DecodeIds( + self.tokenizer.EncodeAsIds(target_text).tokenization) + } + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + if idx < 10: + print_rank_0( + (source_text.encode('utf-8'), target_text.encode('utf-8'), + meta['ref'].encode('utf-8'))) + example_list.append(example) + return example_list + + +class Seq2SeqDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + self.task, self.data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + self.tokenizer = tokenizer + self.dataset_name = split + if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original']: + self.processor = SummmaryProcessor(self.task, self.data_dir, + tokenizer) + elif self.task in ['xsum']: + self.processor = XSumProcessor(self.data_dir, tokenizer) + elif self.task in ['squad_generation']: + self.processor = SQuADProcessor(self.data_dir, tokenizer) + else: + raise NotImplementedError + example_list = self.processor.create_examples(split) + self.example_list = example_list + self.examples = {example.guid: example for example in example_list} + + print_rank_0(f'Return {len(self.examples)} {split} examples') + + def __len__(self): + 
return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + cls_id = self.tokenizer.get_command('ENC').Id + mask_token = 'sMASK' if self.args.task_mask else 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + pad_id = self.tokenizer.get_command('pad').Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original', 'xsum']: + source_text, target_text = example.text_a, example.text_b + source_tokens = self.tokenizer.EncodeAsIds( + ' ' + source_text).tokenization + prompt = [cls_id, mask_id + ] + self.tokenizer.EncodeAsIds(' Content:').tokenization + if len(source_tokens) > self.max_src_length - len(prompt): + source_tokens = source_tokens[:self.max_src_length + - len(prompt)] + source_tokens = prompt + source_tokens + elif self.task == 'squad_generation': + source_text = example.text_a + target_text, answer = example.meta['question'], example.meta[ + 'answer'] + source_tokens = self.tokenizer.EncodeAsIds( + source_text.rstrip() + ' Question:').tokenization + answer_tokens = self.tokenizer.EncodeAsIds(' Answer: ' + + answer).tokenization + if len(source_tokens + ) > self.max_src_length - len(answer_tokens) - 2: + max_src_length = self.max_src_length - len(answer_tokens) - 2 + answer_pattern = self.tokenizer.EncodeAsIds( + ' ' + answer).tokenization + + def sub_finder(mylist, pattern): + matches = [] + for i in range(len(mylist)): + if mylist[i] == pattern[0] and mylist[ + i:i + len(pattern)] == pattern: + matches.append(i) + return matches + + answer_indices = sub_finder(source_tokens, answer_pattern) + if len(answer_indices) == 0: + print(f'Answer {answer} not exists in the source text') + source_tokens = source_tokens[:max_src_length] + else: + start_index = max(answer_indices[0] - max_src_length // 2, + 0) + source_tokens = source_tokens[start_index:start_index + + max_src_length] + source_tokens = [cls_id] + source_tokens + [mask_id + ] + answer_tokens + else: + raise NotImplementedError + if len(source_tokens) < self.max_src_length: + source_tokens = source_tokens + [pad_id] * ( + self.max_src_length - len(source_tokens)) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + mask_pos = source_tokens.index(mask_id) + if self.split == 'train': + target_tokens = self.tokenizer.EncodeAsIds( + ' ' + target_text).tokenization + target_tokens = target_tokens + [eop_id] + if len(target_tokens) > self.max_tgt_length: + target_tokens = target_tokens[:self.max_tgt_length] + loss_mask = [1] * len(target_tokens) + if len(target_tokens) < self.max_tgt_length: + loss_mask += [0] * (self.max_tgt_length - len(target_tokens)) + target_tokens += [pad_id] * ( + self.max_tgt_length - len(target_tokens)) + tokens = source_tokens + [sop_id] + target_tokens[:-1] + loss_mask = [0] * len(source_tokens) + loss_mask + target_ids = [0] * len(source_tokens) + target_tokens + position_ids += [mask_pos] * len(target_tokens) + if self.args.no_block_position: + block_position_ids += [1] * len(target_tokens) + else: + block_position_ids += list(range(1, len(target_tokens) + 1)) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': 
example.guid + } + else: + tokens = source_tokens + [sop_id] + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + +class ExtractionDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + task, data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + self.tokenizer = tokenizer + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'valid' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating {task}-{split} dataset from {data_dir}') + self.dataset_name = split + source_texts, target_texts = [], [] + with open( + os.path.join(data_dir, f'{filename}.source'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + source_texts.append(line) + with open( + os.path.join(data_dir, f'{filename}.target'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + target_texts.append(line) + self.examples, self.example_list = {}, [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = {'ref': target_text} + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + self.examples[guid] = example + self.example_list.append(example) + print_rank_0(f'Return {len(self.examples)} {split} examples') + + def __len__(self): + return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + source_text, target_text = example.text_a, example.text_b + mask_token = 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + pad_id = self.tokenizer.get_command('pad').Id + + def pad_to(text, max_len, pad_id): + if len(text) > max_len: + text = text[:max_len] + else: + text = text + [pad_id] * (max_len - len(text)) + return text + + source_tokens = self.tokenizer.EncodeAsIds(source_text).tokenization + masked_tgt = target_text.split('|') + source_tokens = pad_to(source_tokens, self.max_src_length, pad_id) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + if self.split == 'train': + mask_positions = [ + i for i, x in enumerate(source_tokens) if x == mask_id + ] + assert len(mask_positions) <= len(masked_tgt) + tokens = source_tokens + target_ids = [0] * len(source_tokens) + loss_mask = [0] * len(source_tokens) + for i, mask_pos in enumerate(mask_positions): + tgt_text = masked_tgt[i] + tgt_tokens = self.tokenizer.EncodeAsIds( + ' ' + tgt_text).tokenization + tokens += [sop_id] + tgt_tokens + target_ids += tgt_tokens + [eop_id] + loss_mask += [1] * (len(tgt_tokens) + 1) + position_ids += [mask_pos] * (len(tgt_tokens) + 1) + block_position_ids += [ + i + 1 for i in range(len(tgt_tokens) + 1) + ] + tokens = pad_to(tokens, self.max_src_length + self.max_tgt_length, + pad_id) + target_ids = pad_to(target_ids, + self.max_src_length + self.max_tgt_length, + pad_id) + loss_mask = 
pad_to(loss_mask, + self.max_src_length + self.max_tgt_length, 0) + position_ids = pad_to(position_ids, + self.max_src_length + self.max_tgt_length, 0) + block_position_ids = pad_to( + block_position_ids, self.max_src_length + self.max_tgt_length, + 0) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + else: + tokens = source_tokens + [sop_id] + mask_pos = source_tokens.index(mask_id) + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + +class BlankLMDataset(torch.utils.data.Dataset): + + def __init__(self, args, split, tokenizer): + self.args = args + task, data_dir = args.task.lower(), args.data_dir + self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length + self.split = split + assert args.tokenizer_type == 'BertWordPieceTokenizer' + self.tokenizer = tokenizer + if split == 'train': + filename = 'train' + elif split == 'dev': + filename = 'valid' + elif split == 'test': + filename = 'test' + else: + raise NotImplementedError(split) + print_rank_0(f'Creating {task}-{split} dataset from {data_dir}') + self.dataset_name = split + detokenizer = blanklm_detokenize + source_texts, target_texts = [], [] + with open( + os.path.join(data_dir, f'{filename}.txt'), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = detokenizer(line) if detokenizer else line + target_texts.append(line) + if split == 'test': + with open( + os.path.join( + data_dir, + f'blank/test.maskratio{args.blank_maskratio:.1f}.blank' + ), + encoding='utf-8') as file: + for line in file: + line = line.strip() + line = detokenizer(line) if detokenizer else line + source_texts.append(line) + else: + source_texts = target_texts + self.examples, self.example_list = {}, [] + for idx, (source_text, + target_text) in enumerate(zip(source_texts, target_texts)): + # if idx > 10000: + # break + if (idx + 1) % 20000 == 0: + print_rank_0(f'Complete {idx + 1} examples') + guid = '%s-%s' % (split, idx) + meta = {'ref': target_text} + example = InputExample( + guid=guid, text_a=source_text, text_b=target_text, meta=meta) + self.examples[guid] = example + self.example_list.append(example) + print_rank_0(f'Return {len(self.examples)} {split} examples') + self.random = random.Random(args.seed) + + def __len__(self): + return len(self.example_list) + + def __getitem__(self, idx): + example = self.example_list[idx] + source_text, target_text = example.text_a, example.text_b # noqa + mask_token = 'gMASK' if self.args.task_mask else 'MASK' + mask_id = self.tokenizer.get_command(mask_token).Id + sop_id = self.tokenizer.get_command('sop').Id + eop_id = self.tokenizer.get_command('eop').Id + pad_id = self.tokenizer.get_command('pad').Id + if self.split in ['train', 'dev']: + masked_src, masked_tgt = self.mask_text(source_text) + source_text = masked_src + + def pad_to(text, max_len, pad_id): + if len(text) > max_len: + text = text[:max_len] + else: + text = text + [pad_id] * (max_len - len(text)) + return text + 
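+        # Encode the (possibly masked) source and pad it to max_src_length.
+        # For train/dev, each [MASK] gets a [sop] + answer span appended after
+        # the source, with two-level (token, block) position ids pointing back
+        # at that mask; for test, only a single [sop] is appended.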
+ source_tokens = self.tokenizer.EncodeAsIds(' ' + + source_text).tokenization + source_tokens = pad_to(source_tokens, self.max_src_length, pad_id) + sep = len(source_tokens) + position_ids = list(range(len(source_tokens))) + block_position_ids = [0] * len(source_tokens) + if self.split in ['train', 'dev']: + mask_positions = [ + i for i, x in enumerate(source_tokens) if x == mask_id + ] + assert len(mask_positions) <= len(masked_tgt) + tokens = source_tokens + target_ids = [0] * len(source_tokens) + loss_mask = [0] * len(source_tokens) + for i, mask_pos in enumerate(mask_positions): + tgt_text = masked_tgt[i] + tgt_tokens = self.tokenizer.EncodeAsIds( + ' ' + tgt_text).tokenization + tokens += [sop_id] + tgt_tokens + target_ids += tgt_tokens + [eop_id] + loss_mask += [1] * (len(tgt_tokens) + 1) + position_ids += [mask_pos] * (len(tgt_tokens) + 1) + block_position_ids += [ + i + 1 for i in range(len(tgt_tokens) + 1) + ] + max_length = self.max_src_length + int( + self.max_src_length * self.args.blank_maskratio) + tokens = pad_to(tokens, max_length, pad_id) + target_ids = pad_to(target_ids, max_length, pad_id) + loss_mask = pad_to(loss_mask, max_length, 0) + position_ids = pad_to(position_ids, max_length, 0) + block_position_ids = pad_to(block_position_ids, max_length, 0) + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_mask, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + else: + tokens = source_tokens + [sop_id] + mask_pos = source_tokens.index(mask_id) + position_ids = position_ids + [mask_pos] + block_position_ids = block_position_ids + [1] + position_ids = [position_ids, block_position_ids] + sample = { + 'text': np.array(tokens, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'uid': example.guid + } + return sample + + def mask_text(self, text): + tokens = text.split() + mask_ratio = self.args.blank_maskratio + n = len(tokens) + indices = sorted(self.random.sample(range(n), int(n * mask_ratio))) + masked_src, masked_tgt = '', [] + for i, idx in enumerate(indices): + if i == 0 or idx != indices[i - 1] + 1: + masked_tgt.append('') + masked_tgt[-1] += ' ' + tokens[idx] + tokens[idx] = '[MASK]' + for i, token in enumerate(tokens): + if i != 0 and token == '[MASK]' and tokens[i - 1] == '[MASK]': + continue + masked_src += ' ' + token + return masked_src, masked_tgt diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py new file mode 100644 index 00000000..5fd28b89 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py @@ -0,0 +1,538 @@ +# Copyright (c) 2022 Zhipu.AI + +import datetime +import random +import string + +import mpu +import torch +import torch.nn.functional as F +from generation_utils import (BeamSearchScorer, LogitsProcessorList, + MinLengthLogitsProcessor, + NoRepeatNGramLogitsProcessor) +from rouge_score import rouge_scorer +from utils import print_rank_0 + + +def _is_digit(w): + for ch in w: + if not (ch.isdigit() or ch == ','): + return False + return True + + +gigaword_tok_dict = { + '(': '-lrb-', + ')': '-rrb-', + '[': '-lsb-', + ']': '-rsb-', + '{': '-lcb-', + '}': '-rcb-', + '[UNK]': 'UNK', + '&': '&', + '<': '<', + '>': '>' +} + +cnndm_tok_dict = { + '(': '-LRB-', + 
')': '-RRB-', + '[': '-LSB-', + ']': '-RSB-', + '{': '-LCB-', + '}': '-RCB-' +} + + +def fix_tokenization(text, dataset): + if dataset == 'cnn_dm_org': + return text + if dataset == 'gigaword': + text = text.replace('[UNK]', 'UNK') + return text + input_tokens = text.split() + output_tokens = [] + has_left_quote = False + has_left_single_quote = False + + i = 0 + prev_dash = False + while i < len(input_tokens): + tok = input_tokens[i] + flag_prev_dash = False + if tok == "\"": + if has_left_quote: + output_tokens.append("''") + else: + output_tokens.append('``') + has_left_quote = not has_left_quote + i += 1 + elif tok == "'" and len( + output_tokens) > 0 and output_tokens[-1].endswith( + 'n') and i < len(input_tokens) - 1 and input_tokens[ + i + 1] == 't': # noqa + output_tokens[-1] = output_tokens[-1][:-1] + output_tokens.append("n't") + i += 2 + elif tok == "'" and i < len(input_tokens) - 1 and input_tokens[ + i + 1] in ('s', 'd', 'll'): + output_tokens.append("'" + input_tokens[i + 1]) + i += 2 + elif tok == "'": + if has_left_single_quote: + output_tokens.append("'") + else: + output_tokens.append('`') + has_left_single_quote = not has_left_single_quote + i += 1 + elif tok == '.' and i < len(input_tokens) - 2 and input_tokens[ + i + 1] == '.' and input_tokens[i + 2] == '.': + output_tokens.append('...') + i += 3 + elif tok == ',' and len(output_tokens) > 0 and _is_digit( + output_tokens[-1]) and i < len(input_tokens) - 1 and _is_digit( + input_tokens[i + 1]): + # $ 3 , 000 -> $ 3,000 + output_tokens[-1] += ',' + input_tokens[i + 1] + i += 2 + elif tok == '.' and len(output_tokens) > 0 and output_tokens[-1].isdigit() and i < len(input_tokens) - 1 and \ + input_tokens[i + 1].isdigit(): + # 3 . 03 -> $ 3.03 + output_tokens[-1] += '.' + input_tokens[i + 1] + i += 2 + elif tok == '.' and len(output_tokens) > 0 and len( + output_tokens[-1]) == 1 and output_tokens[-1].isalpha( # noqa + ) and i < len(input_tokens) - 2 and len( # noqa + input_tokens[i + 1]) == 1 and input_tokens[ + i + 1].isalpha( # noqa + ) and input_tokens[i + 2] == '.': # noqa + # U . N . -> U.N. 
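+            # fold following '<letter> .' pairs into the abbreviation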
+ k = i + 3 + while k + 2 < len(input_tokens): + if len(input_tokens[k + 1]) == 1 and input_tokens[ + k + 1].isalpha() and input_tokens[k + 2] == '.': + k += 2 + else: + break + output_tokens[-1] += ''.join(input_tokens[i:k]) + i = k + elif tok == '-': + if i < len(input_tokens) - 1 and input_tokens[i + 1] == '-': + output_tokens.append('--') + i += 2 + elif i == len(input_tokens) - 1 or i == 0: + output_tokens.append('-') + i += 1 + elif output_tokens[-1] not in string.punctuation and input_tokens[ + i + 1][0] not in string.punctuation: + output_tokens[-1] += '-' + i += 1 + flag_prev_dash = True + else: + output_tokens.append('-') + i += 1 + elif prev_dash and len( + output_tokens) > 0 and tok[0] not in string.punctuation: + output_tokens[-1] += tok + i += 1 + else: + output_tokens.append(tok) + i += 1 + prev_dash = flag_prev_dash + return ' '.join(output_tokens) + + +def count_tokens(tokens): + counter = {} + for t in tokens: + if t in counter.keys(): + counter[t] += 1 + else: + counter[t] = 1 + return counter + + +def get_f1(text_a, text_b): + tokens_a = text_a.lower().split() + tokens_b = text_b.lower().split() + if len(tokens_a) == 0 or len(tokens_b) == 0: + return 1 if len(tokens_a) == len(tokens_b) else 0 + set_a = count_tokens(tokens_a) + set_b = count_tokens(tokens_b) + match = 0 + for token in set_a.keys(): + if token in set_b.keys(): + match += min(set_a[token], set_b[token]) + p = match / len(tokens_a) + r = match / len(tokens_b) + return 2.0 * p * r / (p + r + 1e-5) + + +def remove_duplicate(l_list, duplicate_rate): + tk_list = [l.lower().split() for l in l_list] # noqa + r_list = [] + history_set = set() + for i, w_list in enumerate(tk_list): + w_set = set(w_list) + if len(w_set & history_set) / len(w_set) <= duplicate_rate: + r_list.append(l_list[i]) + history_set |= w_set + return r_list + + +def rouge_metric(predictions, + labels, + examples, + metric='rouge-1', + duplicate_rate=0.7, + dataset='cnn_dm'): + metric_dict = { + 'rouge-1': 'rouge1', + 'rouge-2': 'rouge2', + 'rouge-l': 'rougeLsum' + } + refs = [example.meta['ref'] for example in examples] + ref_list = [] + for ref in refs: + ref = ref.strip().split('[SEP]') + ref = [fix_tokenization(sentence, dataset=dataset) for sentence in ref] + ref = '\n'.join(ref) + ref_list.append(ref) + pred_list = [] + for prediction in predictions: + buf = [] + for sentence in prediction.strip().split('[SEP]'): + sentence = fix_tokenization(sentence, dataset=dataset) + if any(get_f1(sentence, s) > 1.0 for s in buf): + continue + s_len = len(sentence.split()) + if s_len <= 4: + continue + buf.append(sentence) + if duplicate_rate and duplicate_rate < 1: + buf = remove_duplicate(buf, duplicate_rate) + line = '\n'.join(buf) + pred_list.append(line) + if torch.distributed.get_rank() == 0: + import json + with open('./results.json', 'w') as output: + for ref, pred in zip(ref_list, pred_list): + output.write(json.dumps({'ref': ref, 'pred': pred}) + '\n') + scorer = rouge_scorer.RougeScorer([metric_dict[metric]], use_stemmer=True) + scores = [ + scorer.score(pred, ref) for pred, ref in zip(pred_list, ref_list) + ] + scores = [score[metric_dict[metric]].fmeasure for score in scores] + scores = sum(scores) / len(scores) + return scores + + +def process_batch(batch, args): + """Process batch and produce inputs for the model.""" + tokens = batch['text'].long().cuda() + attention_mask = batch['attention_mask'].long().cuda() + position_ids = batch['position_id'].long().cuda() + return tokens, attention_mask, position_ids + + +class DecoderEvaluater: 
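+    """Evaluator that decodes predictions with beam search.
+
+    Predictions from all data-parallel ranks are gathered through a
+    TCPStore keyed by example uid before the metrics are computed.
+    """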
+ + def __init__(self, args, tokenizer): + self.tokenizer = tokenizer + self.start_token = tokenizer.get_command('sop').Id + self.end_token = tokenizer.get_command('eop').Id + self.mask_token = tokenizer.get_command( + 'sMASK').Id if args.task_mask else tokenizer.get_command('MASK').Id + self.pad_token = tokenizer.get_command('pad').Id + self.processors = LogitsProcessorList() + if args.min_tgt_length > 0: + processor = MinLengthLogitsProcessor(args.min_tgt_length, + self.end_token) + self.processors.append(processor) + if args.no_repeat_ngram_size > 0: + processor = NoRepeatNGramLogitsProcessor(args.no_repeat_ngram_size) + self.processors.append(processor) + + def evaluate(self, model, dataloader, example_dict, args): + """Calculate correct over total answers and return prediction if the + `output_predictions` is true.""" + model.eval() + store = torch.distributed.TCPStore(args.master_ip, + 18931 + random.randint(0, 10000), + mpu.get_data_parallel_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + print_rank_0('Distributed store created') + with torch.no_grad(): + # For all the batches in the dataset. + for idx, data in enumerate(dataloader): + tokens, attention_mask, position_ids = process_batch( + data, args) + batch_size = tokens.size(0) + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=args.out_seq_length, + num_beams=args.num_beams, + device=tokens.device, + length_penalty=args.length_penalty, + do_early_stopping=False, + ) + beam_scores = torch.zeros((batch_size, args.num_beams), + dtype=torch.float, + device=tokens.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * args.num_beams, )) + # Run the model forward. + counter = 0 + while counter < args.tgt_seq_length: + if counter == 0: + next_token_logits, *mems = model( + tokens, + position_ids, + attention_mask, + return_memory=True) + seq_length = next_token_logits.size(1) + next_token_logits = next_token_logits[:, -1] + next_token_logits = next_token_logits.unsqueeze( + 1).repeat(1, args.num_beams, + 1).view(batch_size * args.num_beams, -1) + mems = [ + mem.unsqueeze(1).repeat( + 1, args.num_beams, 1, + 1).view(batch_size * args.num_beams, + seq_length, -1) for mem in mems + ] + position_ids = tokens.new_ones(batch_size, + args.num_beams, 2, 1) + for i, text in enumerate(tokens.tolist()): + mask_pos = text.index(self.mask_token) + position_ids[i, :, 0] = mask_pos + position_ids = position_ids.reshape( + batch_size * args.num_beams, 2, 1) + tokens = tokens.new_zeros(batch_size * args.num_beams, + 0) + attention_mask = tokens.new_zeros( + [batch_size * args.num_beams]) + else: + if not args.no_block_position: + position_ids[:, 1] = counter + 1 + last_token = tokens[:, -1:] + next_token_logits, *mems = model( + last_token, + position_ids, + attention_mask, + *mems, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + next_token_scores = F.log_softmax( + next_token_logits, dim=-1) + next_token_scores = self.processors( + tokens, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as( + next_token_scores) + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view( + batch_size, args.num_beams * vocab_size) + + probs = F.softmax(next_token_scores, dim=-1) + if args.select_topk: + _, next_tokens = torch.topk( + probs, k=2 * args.num_beams, dim=-1, largest=True) + else: + next_tokens = torch.multinomial( + probs, num_samples=2 * args.num_beams) + next_token_scores = 
torch.gather(next_token_scores, -1, + next_tokens) + next_token_scores, _indices = torch.sort( + next_token_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, _indices) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + # stateless + beam_outputs = beam_scorer.process( + tokens, + next_token_scores, + next_tokens, + next_indices, + eos_token_id=self.end_token, + pad_token_id=self.pad_token) + beam_scores = beam_outputs['next_beam_scores'] + beam_next_tokens = beam_outputs['next_beam_tokens'] + beam_idx = beam_outputs['next_beam_indices'] + beam_next_tokens = beam_next_tokens.unsqueeze(-1) + tokens = torch.cat([tokens[beam_idx, :], beam_next_tokens], + dim=-1) + mems = [mem[beam_idx] for mem in mems] if mems else [] + if beam_scorer.is_done: + break + counter += 1 + tokens, _ = beam_scorer.finalize( + tokens, + beam_scores, + next_tokens, + next_indices, + eos_token_id=self.end_token, + pad_token_id=self.pad_token) + predictions = [] + for text in tokens.tolist(): + text = [ + token for token in text + if token not in [self.end_token, self.pad_token] + ] + text = self.tokenizer.DecodeIds(text) + predictions.append(text) + uid_list = data['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + for uid, prediction in zip(uid_list, predictions): + store.set(uid, prediction) + if (idx + 1) % args.log_interval == 0: + print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}') + model.train() + torch.distributed.barrier() + print_rank_0('Evaluation completed') + predictions, examples = [], [] + for uid, example in example_dict.items(): + predictions.append(store.get(uid).decode('utf-8')) + examples.append(example) + torch.distributed.barrier() + return predictions, [], examples + + +def blanklm_fix_tokenization(text): + text = text.replace('` `', '``') + text = text.replace("\' \'", "\'\'") + text = text.replace("n \' t", "n\'t") + text = text.replace("\' s", "\'s") + text = text.replace("\' m", "\'m") + text = text.replace("\' re", "\'re") + text = text.replace('. . .', '...') + text = text.replace(' . .', ' ..') + text = text.replace('- -', '--') + text = text.replace('u . s .', 'u.s.') + text = text.replace('u . k .', 'u.k.') + text = text.replace('e . 
g .', 'e.g.') + return text + + +class BlankLMEvaluater(DecoderEvaluater): + + def evaluate(self, model, dataloader, example_dict, args): + model.eval() + store = torch.distributed.TCPStore(args.master_ip, + 18931 + random.randint(0, 10000), + mpu.get_data_parallel_world_size(), + torch.distributed.get_rank() == 0, + datetime.timedelta(seconds=30)) + print_rank_0('Distributed store created') + + with torch.no_grad(): + for idx, data in enumerate(dataloader): + tokens, attention_mask, position_ids = process_batch( + data, args) + src_tokens = tokens + batch_size = tokens.size(0) + mask_positions = [] + current_mask = [] + for text in tokens.tolist(): + mask_positions.append([ + i for i, x in enumerate(text) if x == self.mask_token + ]) + current_mask.append(0) + # print(self.tokenizer.DecodeIds(text)) + # print(mask_positions[-1]) + counter = 0 + done = [False] * batch_size + while counter < args.tgt_seq_length: + if counter == 0: + # print(tokens) + # print(position_ids) + next_token_logits, *mems = model( + tokens, + position_ids, + attention_mask, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + position_ids = tokens.new_ones(batch_size, 2, 1) + for i, text in enumerate(tokens.tolist()): + mask_pos = mask_positions[i][current_mask[i]] + position_ids[i, 0] = mask_pos + tokens = tokens.new_zeros(batch_size, 0) + attention_mask = tokens.new_zeros(batch_size) + else: + position_ids[:, 1] = position_ids[:, 1] + 1 + last_token = tokens[:, -1:] + next_token_logits, *mems = model( + last_token, + position_ids, + attention_mask, + *mems, + return_memory=True) + next_token_logits = next_token_logits[:, -1] + next_token_scores = F.log_softmax( + next_token_logits, dim=-1) + next_token_scores = self.processors( + tokens, next_token_scores) + next_tokens = next_token_scores.max(dim=-1)[1] + # print(self.tokenizer.DecodeIds(next_tokens.tolist())) + for i, next_token in enumerate(next_tokens.tolist()): + if next_token == self.end_token: + if current_mask[i] + 1 < len(mask_positions[i]): + current_mask[i] += 1 + next_tokens[i] = self.start_token + position_ids[i, 0] = mask_positions[i][ + current_mask[i]] + position_ids[i, 1] = 0 + else: + done[i] = True + if done[i]: + next_tokens[i] = self.pad_token + if all(done): + break + tokens = torch.cat( + [tokens, next_tokens.unsqueeze(-1)], dim=-1) + counter += 1 + predictions = [] + for i, text in enumerate(tokens.tolist()): + text = [ + token for token in text + if token not in [self.end_token, self.pad_token] + ] + blanks = [[]] + for token in text: + if token == self.start_token: + blanks.append([]) + else: + blanks[-1].append(token) + output_tokens = [] + current_blank = 0 + for token in src_tokens[i].tolist(): + if token == self.mask_token: + if current_blank < len(blanks): + output_tokens += blanks[current_blank] + current_blank += 1 + else: + if token not in [self.pad_token]: + output_tokens.append(token) + text = self.tokenizer.DecodeIds(output_tokens[:-1]) + text = blanklm_fix_tokenization(text) + predictions.append(text) + # print(text) + uid_list = data['uid'] + if isinstance(uid_list, torch.Tensor): + uid_list = uid_list.cpu().numpy().tolist() + for uid, prediction in zip(uid_list, predictions): + store.set(uid, prediction) + if (idx + 1) % args.log_interval == 0: + print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}') + + model.train() + torch.distributed.barrier() + print_rank_0('Evaluation completed') + predictions, examples = [], [] + for uid, example in example_dict.items(): + 
predictions.append(store.get(uid).decode('utf-8')) + examples.append(example) + torch.distributed.barrier() + return predictions, [], examples diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py new file mode 100644 index 00000000..4c0c28e7 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Race.""" +import functools +from collections import OrderedDict + +import mpu +import torch +from finetune_glm import finetune +from pretrain_glm import get_batch +from tasks.eval_utils import accuracy_func_provider +from tasks.seq2seq.dataset import (BlankLMDataset, ExtractionDataset, + Seq2SeqDataset) +from tasks.seq2seq.evaluate import (BlankLMEvaluater, DecoderEvaluater, + rouge_metric) + +global_tokenizer = None + + +def seq2seq_forward_step(data, model, args, timers, mems): + """Forward step.""" + + # Get the batch. + if timers is not None: + timers('batch generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data, args) + if timers is not None: + timers('batch generator').stop() + # Forward model. + logits, *mems = model(tokens, position_ids, attention_mask, *mems) + # logits, loss_mask = logits[:, args.src_seq_length:], loss_mask[:, args.src_seq_length:] + # target_ids = target_ids[:, args.src_seq_length:] + losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), + labels) + if args.label_smoothing > 0.0: + epsilon = args.label_smoothing + smooth_loss = -torch.nn.functional.log_softmax( + logits, dim=-1).mean(dim=-1) + losses = (1 - epsilon) * losses + epsilon * smooth_loss + loss_mask = loss_mask.reshape(-1) + # The loss is not normalized for fair comparison + loss = torch.sum(losses.reshape(-1) * loss_mask) / loss_mask.sum() + return loss, mems, 'bert' + + +def train_valid_datasets_provider(args, tokenizer): + """Provide train and validation datasets.""" + if args.task.lower() == 'blank': + train_dataset = BlankLMDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + elif args.task.lower() == 'extraction': + train_dataset = ExtractionDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + else: + train_dataset = Seq2SeqDataset( + args, split='train', tokenizer=tokenizer) + valid_dataset = None + global global_tokenizer + global_tokenizer = tokenizer + return train_dataset, valid_dataset + + +def metrics_func_provider(args, tokenizer, is_test): + """Provide metrics callback function.""" + + def single_dataset_provider(split): + if args.task.lower() == 'blank': + return BlankLMDataset(args, split=split, tokenizer=tokenizer) + elif args.task.lower() == 'extraction': + return ExtractionDataset(args, split=split, tokenizer=tokenizer) + else: + return Seq2SeqDataset(args, split=split, tokenizer=tokenizer) + + if args.task.lower() in ['blank', 'extraction']: + evaluater = 
BlankLMEvaluater(args, tokenizer) + eval_func = evaluater.evaluate + metric_dict = {} + else: + evaluater = DecoderEvaluater(args, tokenizer) + eval_func = evaluater.evaluate + if args.tokenizer_type == 'BertWordPieceTokenizer': + dataset = 'cnn_dm' + elif args.task.lower() == 'gigaword': + dataset = 'gigaword' + else: + dataset = 'cnn_dm_org' + metric_dict = OrderedDict({ + 'rouge-1': + functools.partial(rouge_metric, metric='rouge-1', dataset=dataset), + 'rouge-2': + functools.partial(rouge_metric, metric='rouge-2', dataset=dataset), + 'rouge-l': + functools.partial(rouge_metric, metric='rouge-l', dataset=dataset) + }) + + def output_func(predictions, examples, output_file): + with open(output_file + '.hyps', 'w', encoding='utf-8') as output: + for prediction in predictions: + output.write(prediction) + output.write('\n') + with open(output_file + '.refs', 'w', encoding='utf-8') as output: + for example in examples: + output.write(example.meta['ref']) + output.write('\n') + if args.task.lower() == 'squad_generation': + with open( + output_file + '.source', 'w', encoding='utf-8') as output: + for example in examples: + output.write( + example.text_a.replace('\n', ' ') + ' Answer: ' + + example.meta['answer']) + output.write('\n') + + return accuracy_func_provider( + single_dataset_provider, + metric_dict, + args, + is_test=is_test, + eval_func=eval_func, + output_func=output_func, + only_rank0=False) + + +def main(args): + if args.src_seq_length > args.max_position_embeddings: + args.max_position_embeddings = args.src_seq_length + if args.task.lower() in [ + 'cnn_dm', 'cnn_dm_original', 'gigaword', 'blank', + 'squad_generation', 'xsum', 'extraction' + ]: + finetune( + args, + train_valid_datasets_provider, {}, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=seq2seq_forward_step) + else: + raise NotImplementedError(args.task) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/README.md b/modelscope/models/nlp/mglm/tasks/superglue/README.md new file mode 100644 index 00000000..94aab0e9 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/README.md @@ -0,0 +1,137 @@ +# Use GLM for your NLU tasks +To use GLM for your own NLU tasks, you should implement a subclass of `DataProcessor` in [tasks/superglue/dataset.py](dataset.py) and a subclass of `PVP` in [tasks/superglue/pvp.py](pvp.py). You should also specify the We will take the RTE and ReCoRD tasks in SuperGLUE as an example. + +## 1. Design your patterns +RTE is an NLI task in which the model is required to predict text entailment between a premise and a hypothesis. The label can be `entailment` or `not_entailment` One sample from the training set is +``` +premise: No Weapons of Mass Destruction Found in Iraq Yet. +hypothesis: Weapons of Mass Destruction Found in Iraq. +label: not_entailment +``` +We design the pattern as +``` +"`hypothesis`"?, [MASK], "`premise`" +``` +GLM predicts "Yes" for `entailment` and "No" for `not_entailment`. "Yes" and "No" are called verbalizers for `entailment` and `not_entailment`. + +ReCoRD is a multi-choice QA task. Each example consists of a news article and a Cloze-style question about the article in which one entity is masked out. The system must predict the masked out entity from a list of possible entities in the provided passage. We directly adopt the cloze-style question as our pattern and use GLM to predict the masked entity. + +## 2. 
Implement a subclass of `DataProcessor`
+A subclass of `DataProcessor` should implement `get_train_examples`, `get_dev_examples` and `get_test_examples`, which return the examples of the train, dev, and test sets. The returned value is a list of `InputExample`. It should also implement `get_labels` to return the list of possible labels. Here we take `RteProcessor` as an example:
+```python
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set."""
+
+    def get_train_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "train.jsonl"), "train")
+
+    def get_dev_examples(self, data_dir, for_train=False):
+        return self._create_examples(os.path.join(data_dir, "val.jsonl"), "dev")
+
+    def get_test_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "test.jsonl"), "test")
+
+    def get_unlabeled_examples(self, data_dir):
+        return self._create_examples(os.path.join(data_dir, "unlabeled.jsonl"), "unlabeled")
+
+    def get_labels(self):
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis",
+                         premise_name: str = "premise") -> List[InputExample]:
+        examples = []
+
+        with open(path, encoding='utf8') as f:
+            for line_idx, line in enumerate(f):
+                example_json = json.loads(line)
+                idx = example_json['idx']
+                if isinstance(idx, str):
+                    try:
+                        idx = int(idx)
+                    except ValueError:
+                        idx = line_idx
+                label = example_json.get('label')
+                guid = "%s-%s" % (set_type, idx)
+                text_a = example_json[premise_name]
+                text_b = example_json[hypothesis_name]
+
+                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx)
+                examples.append(example)
+
+        return examples
+```
+After that, you should add the implemented class to ``PROCESSORS`` at the end of [tasks/superglue/dataset.py](dataset.py):
+```python
+PROCESSORS = {
+    ...
+    "rte": RteProcessor
+}
+```
+
+## 3. Implement a subclass of `PVP`
+To implement a subclass of `PVP`, you should first decide whether your verbalizers are single-token or multi-token. The verbalizers in RTE, "Yes" and "No", are single-token. In contrast, the verbalizers in ReCoRD are multi-token, as one entity can be tokenized into multiple tokens by a WordPiece or BPE tokenizer.
+
+For a single-token task, you should set `is_multi_token=False` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `verbalize` to return the verbalizer given a label. Take `RtePVP` as an example:
+```python
+class RtePVP(PVP):
+    is_multi_token = False
+    VERBALIZER = {
+        "not_entailment": [" No"],
+        "entailment": [" Yes"]
+    }
+
+    @property
+    def spell_length(self):
+        return self.pattern_id
+
+    def get_parts(self, example: InputExample) -> FilledPattern:
+        # switch text_a and text_b to get the correct order
+        text_a = example.text_a
+        text_b = example.text_b.rstrip(string.punctuation)
+        return ['"', self.shortenable(text_b), '" ?'], [[self.mask], ', "', self.shortenable(text_a), '"']
+
+    def verbalize(self, label) -> List[str]:
+        return RtePVP.VERBALIZER[label]
+```
+We use `PVP.shortenable` to mark the segments that can be truncated when the input exceeds the maximum sequence length.
+
+For a multi-token task, you should set `is_multi_token=True` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `get_answers` to return the candidates. 
Take `RecordPVP` as an example:
+```python
+class RecordPVP(PVP):
+    is_multi_token = True
+
+    def get_answers(self, example: InputExample):
+        choices = example.meta['candidates']
+        choices = [" " + choice for choice in choices]
+        return choices
+
+    def get_parts(self, example: InputExample) -> FilledPattern:
+        premise = self.shortenable(example.text_a)
+
+        assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token'
+        question_a, question_b = example.text_b.split('@placeholder')
+        return [premise, " " + question_a.rstrip(), [self.mask], question_b], []
+```
+After that, you should add the implemented class to `PVPS` at the end of [tasks/superglue/pvp.py](pvp.py):
+```python
+PVPS = {
+    ...
+    'rte': RtePVP,
+    'record': RecordPVP
+}
+```
+## 4. Run the experiment
+To run the experiment for your new task, you should create a config file like [config_tasks/task_rte.sh](/config_tasks/task_rte.sh). You should also specify the evaluation metrics for the task in `DEFAULT_METRICS` of [tasks/superglue/finetune.py](finetune.py):
+```python
+DEFAULT_METRICS = {
+    ...
+    "record": [("EM", qa_exact_match), ("F1", qa_f1)],
+    "rte": [("accuracy", accuracy_metric)]
+}
+```
+Then you can run the experiment with [finetune_superglue.sh](/scripts/finetune_superglue.sh):
+```shell
+bash scripts/finetune_superglue.sh \
+     config_tasks/model_blocklm_large.sh \
+     config_tasks/task_rte.sh
+```
diff --git a/modelscope/models/nlp/mglm/tasks/superglue/__init__.py b/modelscope/models/nlp/mglm/tasks/superglue/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/nlp/mglm/tasks/superglue/dataset.py b/modelscope/models/nlp/mglm/tasks/superglue/dataset.py
new file mode 100644
index 00000000..36367671
--- /dev/null
+++ b/modelscope/models/nlp/mglm/tasks/superglue/dataset.py
@@ -0,0 +1,1475 @@
+# Copyright (c) 2022 Zhipu.AI
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains the logic for loading training and test data for all tasks. 
+""" + +import copy +import csv +import glob +import os +import random +import re +from abc import ABC, abstractmethod +from collections import Counter, defaultdict +from typing import Callable, Dict, List + +import json +import numpy as np +import pandas as pd +from data_utils import (build_input_from_ids, build_sample, + num_special_tokens_to_add) +from data_utils.corpora import punctuation_standardization +from torch.utils.data import Dataset +from tqdm import tqdm +from utils import print_rank_0 + +from modelscope.models.nlp.mglm.tasks.data_utils import InputExample +from modelscope.models.nlp.mglm.tasks.superglue.pvp import PVPS + +TRAIN_SET = 'train' +DEV_SET = 'dev' +TEST_SET = 'test' +TRUE_DEV_SET = 'true_dev' +UNLABELED_SET = 'unlabeled' + +SPLIT_TYPES = [TRAIN_SET, DEV_SET, TEST_SET, TRUE_DEV_SET, UNLABELED_SET] + + +def get_output_func(task_name, args): + return PROCESSORS[task_name](args).output_prediction + + +def read_tsv(path, **kwargs): + return pd.read_csv( + path, + sep='\t', + quoting=csv.QUOTE_NONE, + dtype=str, + na_filter=False, + **kwargs) + + +class SuperGlueDataset(Dataset): + + def __init__(self, + args, + task_name, + data_dir, + seq_length, + split, + tokenizer, + for_train=False, + pattern_ensemble=False, + pattern_text=False): + self.processor = PROCESSORS[task_name](args) + args.variable_num_choices = self.processor.variable_num_choices + print_rank_0( + f'Creating {task_name} dataset from file at {data_dir} (split={split})' + ) + self.dataset_name = f'{task_name}-{split}' + self.cloze_eval = args.cloze_eval + self.seq_length = seq_length + self.tokenizer = tokenizer + self.pattern_ensemble = pattern_ensemble + self.pattern_text = pattern_text + if pattern_text: + assert self.cloze_eval, 'Labeled examples only exist in cloze evaluation' + self.args = args + if split == DEV_SET: + example_list = self.processor.get_dev_examples( + data_dir, for_train=for_train) + elif split == TEST_SET: + example_list = self.processor.get_test_examples(data_dir) + elif split == TRUE_DEV_SET: + example_list = self.processor.get_true_dev_examples(data_dir) + elif split == TRAIN_SET: + if task_name == 'wsc': + example_list = self.processor.get_train_examples( + data_dir, cloze_eval=args.cloze_eval) + else: + example_list = self.processor.get_train_examples(data_dir) + elif split == UNLABELED_SET: + example_list = self.processor.get_unlabeled_examples(data_dir) + for example in example_list: + example.label = self.processor.get_labels()[0] + else: + raise ValueError( + f"'split' must be one of {SPLIT_TYPES}, got '{split}' instead") + if split == TEST_SET: + self.labeled = False + else: + self.labeled = True + + label_distribution = Counter(example.label for example in example_list) + print_rank_0( + f'Returning {len(example_list)} {split} examples with label dist.: {list(label_distribution.items())}' + ) + self.samples = [] + example_list.sort(key=lambda x: x.num_choices) + self.example_list = example_list + if self.cloze_eval: + if self.pattern_ensemble: + pattern_ids = PVPS[task_name].available_patterns() + self.pvps = [] + for pattern_id in pattern_ids: + self.pvps.append(PVPS[task_name]( + args, + tokenizer, + self.processor.get_labels(), + seq_length, + pattern_id=pattern_id, + num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode, + split=split)) + else: + self.pvp = PVPS[task_name]( + args, + tokenizer, + self.processor.get_labels(), + seq_length, + pattern_id=args.pattern_id, + 
num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode, + split=split) + self.examples = {example.guid: example for example in example_list} + + def __len__(self): + if self.cloze_eval and self.pattern_ensemble: + return len(self.example_list) * len(self.pvps) + else: + return len(self.example_list) + + def __getitem__(self, idx): + sample_idx = idx % len(self.example_list) + example = self.example_list[sample_idx] + if self.cloze_eval: + kwargs = {} + if self.pattern_text: + kwargs = {'labeled': True, 'priming': True} + if self.pattern_ensemble: + pvp_idx = idx // len(self.example_list) + sample = self.pvps[pvp_idx].encode(example, **kwargs) + else: + sample = self.pvp.encode(example, **kwargs) + if self.pattern_text: + eos_id = self.tokenizer.get_command('eos').Id + cls_id = self.tokenizer.get_command('ENC').Id + input_ids = [cls_id] + sample + [eos_id] + sample = { + 'text': input_ids, + 'loss_mask': np.array([1] * len(input_ids)) + } + else: + sample = self.processor.encode(example, self.tokenizer, + self.seq_length, self.args) + return sample + + +class DataProcessor(ABC): + """ + Abstract class that provides methods for loading training, testing, development and unlabeled examples for a given + task + """ + + def __init__(self, args): + self.args = args + self.num_truncated = 0 + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + for prediction, example in zip(predictions, examples): + prediction = self.get_labels()[prediction] + data = {'idx': example.idx, 'label': prediction} + output.write(json.dumps(data) + '\n') + + @property + def variable_num_choices(self): + return False + + @abstractmethod + def get_train_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the train set.""" + pass + + @abstractmethod + def get_dev_examples(self, + data_dir, + for_train=False) -> List[InputExample]: + """Get a collection of `InputExample`s for the dev set.""" + pass + + def get_test_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the test set.""" + return [] + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + """Get a collection of `InputExample`s for the unlabeled set.""" + return [] + + @abstractmethod + def get_labels(self) -> List[str]: + """Get the list of labels for this data set.""" + pass + + def get_classifier_input(self, example: InputExample, tokenizer): + return example.text_a, example.text_b + + def encode(self, example: InputExample, tokenizer, seq_length, args): + text_a, text_b = self.get_classifier_input(example, tokenizer) + tokens_a = tokenizer.EncodeAsIds(text_a).tokenization + tokens_b = tokenizer.EncodeAsIds(text_b).tokenization + num_special_tokens = num_special_tokens_to_add( + tokens_a, + tokens_b, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if len(tokens_a) + len(tokens_b) + num_special_tokens > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + seq_length, + tokenizer, + args=args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + label = 0 + if example.label is not None: + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + ids, + label=label, + types=types, + paddings=paddings, + unique_id=example.guid) 
+ else: + sample = build_sample( + ids, + positions=position_ids, + masks=sep, + label=label, + unique_id=example.guid) + return sample + + +class SuperGLUEProcessor(DataProcessor): + + def __init__(self, args): + super(SuperGLUEProcessor, self).__init__(args) + self.few_superglue = args.few_superglue + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.jsonl'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + if self.few_superglue: + return self._create_examples( + os.path.join(data_dir, 'dev32.jsonl'), 'dev') + else: + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'dev') + + def get_test_examples(self, data_dir): + if self.few_superglue: + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'test') + else: + return self._create_examples( + os.path.join(data_dir, 'test.jsonl'), 'test') + + def get_unlabeled_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'unlabeled.jsonl'), 'unlabeled') + + def _create_examples(self, *args, **kwargs): + pass + + +class RteProcessor(SuperGLUEProcessor): + """Processor for the RTE data set.""" + + def get_labels(self): + return ['entailment', 'not_entailment'] + + def _create_examples(self, + path: str, + set_type: str, + hypothesis_name: str = 'hypothesis', + premise_name: str = 'premise') -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line_idx, line in enumerate(f): + example_json = json.loads(line) + idx = example_json['idx'] + if isinstance(idx, str): + try: + idx = int(idx) + except ValueError: + idx = line_idx + label = example_json.get('label') + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization( + example_json[premise_name]) + text_b = punctuation_standardization( + example_json[hypothesis_name]) + + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx) + examples.append(example) + + return examples + + +class AxGProcessor(RteProcessor): + """Processor for the AX-G diagnostic data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-g.jsonl'), 'train') + + def get_test_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-g.jsonl'), 'test') + + +class AxBProcessor(RteProcessor): + """Processor for the AX-B diagnostic data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-b.jsonl'), 'train') + + def get_test_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'AX-b.jsonl'), 'test') + + def _create_examples(self, + path, + set_type, + hypothesis_name='sentence2', + premise_name='sentence1'): + return super()._create_examples(path, set_type, hypothesis_name, + premise_name) + + +class CbProcessor(RteProcessor): + """Processor for the CB data set.""" + + def get_labels(self): + return ['entailment', 'contradiction', 'neutral'] + + +class WicProcessor(SuperGLUEProcessor): + """Processor for the WiC data set.""" + + def get_labels(self): + return ['false', 'true'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + if isinstance(idx, str): + idx = int(idx) + label = 'true' if example_json.get('label') else 'false' + guid = '%s-%s' % (set_type, 
idx) + text_a = punctuation_standardization(example_json['sentence1']) + text_b = punctuation_standardization(example_json['sentence2']) + meta = {'word': example_json['word']} + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx, + meta=meta) + examples.append(example) + return examples + + def get_classifier_input(self, example: InputExample, tokenizer): + text_a = example.meta['word'] + ': ' + example.text_a + return text_a, example.text_b + + +class WscProcessor(SuperGLUEProcessor): + """Processor for the WSC data set.""" + + @property + def variable_num_choices(self): + return self.args.wsc_negative + + def get_train_examples(self, data_dir, cloze_eval=True): + return self._create_examples( + os.path.join(data_dir, 'train.jsonl'), + 'train', + cloze_eval=cloze_eval) + + def get_labels(self): + return ['False', 'True'] + + def get_classifier_input(self, example: InputExample, tokenizer): + target = example.meta['span1_text'] + pronoun_idx = example.meta['span2_index'] + + # mark the pronoun with asterisks + words_a = example.text_a.split() + words_a[pronoun_idx] = '*' + words_a[pronoun_idx] + '*' + text_a = ' '.join(words_a) + text_b = target + return text_a, text_b + + def _create_examples(self, + path: str, + set_type: str, + cloze_eval=True) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + label = str( + example_json['label']) if 'label' in example_json else None + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization(example_json['text']) + meta = { + 'span1_text': example_json['target']['span1_text'], + 'span2_text': example_json['target']['span2_text'], + 'span1_index': example_json['target']['span1_index'], + 'span2_index': example_json['target']['span2_index'] + } + if 'candidates' in example_json: + candidates = [ + cand['text'] for cand in example_json['candidates'] + ] + # candidates = list(set(candidates)) + filtered = [] + for i, cand in enumerate(candidates): + if cand not in candidates[:i]: + filtered.append(cand) + candidates = filtered + + # the indices in the dataset are wrong for some examples, so we manually fix them + span1_index, span1_text = meta['span1_index'], meta[ + 'span1_text'] + span2_index, span2_text = meta['span2_index'], meta[ + 'span2_text'] + words_a = text_a.split() + words_a_lower = text_a.lower().split() + words_span1_text = span1_text.lower().split() + span1_len = len(words_span1_text) + + if words_a_lower[span1_index:span1_index + + span1_len] != words_span1_text: + for offset in [-1, +1]: + if words_a_lower[span1_index + offset:span1_index + + span1_len + + offset] == words_span1_text: + span1_index += offset + + # if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text: + # print_rank_0(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected " + # f"'{words_span1_text}' at index {span1_index} for '{words_a}'") + + if words_a[span2_index] != span2_text: + for offset in [-1, +1]: + if words_a[span2_index + offset] == span2_text: + span2_index += offset + + if words_a[span2_index] != span2_text and words_a[ + span2_index].startswith(span2_text): + words_a = words_a[:span2_index] \ + + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] + words_a[span2_index + 1:] # noqa + + assert words_a[span2_index] == span2_text, \ + f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for 
'{words_a}'" + + text_a = ' '.join(words_a) + meta['span1_index'], meta[ + 'span2_index'] = span1_index, span2_index + + if self.args.task == 'wsc1': + example = InputExample( + guid=guid, + text_a=text_a, + text_b=span1_text, + label=label, + meta=meta, + idx=idx) + examples.append(example) + if set_type == 'train' and label == 'True': + for cand in candidates: + example = InputExample( + guid=guid, + text_a=text_a, + text_b=cand, + label='False', + meta=meta, + idx=idx) + examples.append(example) + continue + + if cloze_eval and set_type == 'train' and label != 'True': + continue + if set_type == 'train' and 'candidates' in example_json and len( + candidates) > 9: + for i in range(0, len(candidates), 9): + _meta = copy.deepcopy(meta) + _meta['candidates'] = candidates[i:i + 9] + if len(_meta['candidates']) < 9: + _meta['candidates'] += candidates[:9 - len( + _meta['candidates'])] + example = InputExample( + guid=guid, + text_a=text_a, + label=label, + meta=_meta, + idx=idx) + examples.append(example) + else: + if 'candidates' in example_json: + meta['candidates'] = candidates + example = InputExample( + guid=guid, + text_a=text_a, + label=label, + meta=meta, + idx=idx) + examples.append(example) + + return examples + + +class BoolQProcessor(SuperGLUEProcessor): + """Processor for the BoolQ data set.""" + + def get_labels(self): + return ['false', 'true'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + idx = example_json['idx'] + label = str(example_json['label']).lower( + ) if 'label' in example_json else None + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization(example_json['passage']) + text_b = punctuation_standardization(example_json['question']) + example = InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + idx=idx) + examples.append(example) + + return examples + + +class CopaProcessor(SuperGLUEProcessor): + """Processor for the COPA data set.""" + + def get_labels(self): + return [0, 1] + + def encode(self, example: InputExample, tokenizer, seq_length, args): + if args.pretrained_bert: + ids_list, types_list, paddings_list = [], [], [] + else: + ids_list, positions_list, sep_list = [], [], [] + question = example.meta['question'] + joiner = 'because' if question == 'cause' else 'so' + text_a = punctuation_standardization(example.text_a) + ' ' + joiner + tokens_a = tokenizer.EncodeAsIds(text_a).tokenization + for choice in [example.meta['choice1'], example.meta['choice2']]: + choice = punctuation_standardization(choice) + tokens_b = tokenizer.EncodeAsIds(choice).tokenization + num_special_tokens = num_special_tokens_to_add( + tokens_a, + tokens_b, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if len(tokens_a) + len(tokens_b) + num_special_tokens > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + seq_length, + tokenizer, + args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if args.pretrained_bert: + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + else: + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + label = 0 + if example.label is not None: + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + 
ids_list, + label=label, + types=types_list, + paddings=paddings_list, + unique_id=example.guid) + else: + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + unique_id=example.guid) + return sample + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + label = example_json[ + 'label'] if 'label' in example_json else None + idx = example_json['idx'] + guid = '%s-%s' % (set_type, idx) + text_a = example_json['premise'] + meta = { + 'choice1': example_json['choice1'], + 'choice2': example_json['choice2'], + 'question': example_json['question'] + } + example = InputExample( + guid=guid, text_a=text_a, label=label, meta=meta, idx=idx) + examples.append(example) + + if set_type == 'train' or set_type == 'unlabeled': + mirror_examples = [] + for ex in examples: + label = 1 if ex.label == 0 else 0 + meta = { + 'choice1': ex.meta['choice2'], + 'choice2': ex.meta['choice1'], + 'question': ex.meta['question'] + } + mirror_example = InputExample( + guid=ex.guid + 'm', + text_a=ex.text_a, + label=label, + meta=meta) + mirror_examples.append(mirror_example) + examples += mirror_examples + print_rank_0( + f'Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...' + ) + return examples + + +class MultiRcProcessor(SuperGLUEProcessor): + """Processor for the MultiRC data set.""" + + def get_labels(self): + return [0, 1] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + + passage_idx = example_json['idx'] + text = punctuation_standardization( + example_json['passage']['text']) + questions = example_json['passage']['questions'] + for question_json in questions: + question = punctuation_standardization( + question_json['question']) + question_idx = question_json['idx'] + answers = question_json['answers'] + for answer_json in answers: + label = answer_json[ + 'label'] if 'label' in answer_json else None + answer_idx = answer_json['idx'] + guid = f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}' + meta = { + 'passage_idx': + passage_idx, + 'question_idx': + question_idx, + 'answer_idx': + answer_idx, + 'answer': + punctuation_standardization(answer_json['text']) + } + idx = [passage_idx, question_idx, answer_idx] + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label=label, + meta=meta, + idx=idx) + examples.append(example) + + question_indices = list( + set(example.meta['question_idx'] for example in examples)) + label_distribution = Counter(example.label for example in examples) + print_rank_0( + f'Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label ' + f'distribution {list(label_distribution.items())}') + return examples + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + passage_dict = defaultdict(list) + for prediction, example in zip(predictions, examples): + passage_dict[example.meta['passage_idx']].append( + (prediction, example)) + for passage_idx, data in passage_dict.items(): + question_dict = defaultdict(list) + passage_data = { + 'idx': passage_idx, + 'passage': { + 'questions': [] + } + } + for prediction, example in data: + question_dict[example.meta['question_idx']].append( + (prediction, 
example)) + for question_idx, data in question_dict.items(): + question_data = {'idx': question_idx, 'answers': []} + for prediction, example in data: + prediction = self.get_labels()[prediction] + question_data['answers'].append({ + 'idx': + example.meta['answer_idx'], + 'label': + prediction + }) + passage_data['passage']['questions'].append(question_data) + output.write(json.dumps(passage_data) + '\n') + + def get_classifier_input(self, example: InputExample, tokenizer): + text_a = example.text_a + text_b = ' '.join([example.text_b, 'answer:', example.meta['answer']]) + return text_a, text_b + + +class RaceProcessor(DataProcessor): + + @property + def variable_num_choices(self): + return True + + def get_labels(self): + return ['A', 'B', 'C', 'D'] + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'train'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev'), 'dev', for_train=for_train) + + def get_test_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'test'), 'test') + + @staticmethod + def _create_examples(path, + set_type, + for_train=False) -> List[InputExample]: + examples = [] + + def clean_text(text): + """Remove new lines and multiple spaces and adjust end of sentence dot.""" + + text = text.replace('\n', ' ') + text = re.sub(r'\s+', ' ', text) + for _ in range(3): + text = text.replace(' . ', '. ') + + return text + + filenames = glob.glob(os.path.join( + path, 'middle', '*.txt')) + glob.glob( + os.path.join(path, 'high', '*.txt')) + for filename in filenames: + with open(filename, 'r') as f: + for line in f: + data = json.loads(line) + idx = data['id'] + context = data['article'] + questions = data['questions'] + choices = data['options'] + answers = data['answers'] + # Check the length. 
+ assert len(questions) == len(answers) + assert len(questions) == len(choices) + + context = clean_text(context) + for question_idx, question in enumerate(questions): + answer = answers[question_idx] + choice = choices[question_idx] + guid = f'{set_type}-p{idx}-q{question_idx}' + ex_idx = [set_type, idx, question_idx] + meta = {'choices': choice} + example = InputExample( + guid=guid, + text_a=context, + text_b=question, + label=answer, + meta=meta, + idx=ex_idx) + examples.append(example) + return examples + + +class RecordProcessor(SuperGLUEProcessor): + """Processor for the ReCoRD data set.""" + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'val.jsonl'), 'dev', for_train=for_train) + + @property + def variable_num_choices(self): + return True + + def get_labels(self): + return ['0', '1'] + + def output_prediction(self, predictions, examples, output_file): + with open(output_file, 'w') as output: + for prediction, example in zip(predictions, examples): + prediction = example.meta['candidates'][prediction] + data = {'idx': example.idx, 'label': prediction} + output.write(json.dumps(data) + '\n') + + def encode(self, example: InputExample, tokenizer, seq_length, args): + if args.pretrained_bert: + ids_list, types_list, paddings_list = [], [], [] + else: + ids_list, positions_list, sep_list = [], [], [] + tokens_a = tokenizer.EncodeAsIds(example.text_a).tokenization + tokens_b = tokenizer.EncodeAsIds( + example.text_b).tokenization if example.text_b else None + for answer in example.meta['candidates']: + answer_ids = tokenizer.EncodeAsIds(answer).tokenization + total_length = len(tokens_a) + len(tokens_b) + len(answer_ids) + total_length += num_special_tokens_to_add( + tokens_a, + tokens_b + answer_ids, + None, + add_cls=True, + add_sep=True, + add_piece=False) + if total_length > seq_length: + self.num_truncated += 1 + data = build_input_from_ids( + tokens_a, + tokens_b + answer_ids, + None, + seq_length, + tokenizer, + args, + add_cls=True, + add_sep=True, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + if args.pretrained_bert: + ids_list.append(ids) + types_list.append(types) + paddings_list.append(paddings) + else: + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + label = example.label + label = self.get_labels().index(label) + if args.pretrained_bert: + sample = build_sample( + ids_list, + label=label, + types=types_list, + paddings=paddings_list, + unique_id=example.guid) + else: + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + unique_id=example.guid) + return sample + + @staticmethod + def _create_examples(path, + set_type, + seed=42, + max_train_candidates_per_question: int = 10, + for_train=False) -> List[InputExample]: + examples = [] + + entity_shuffler = random.Random(seed) + + with open(path, encoding='utf8') as f: + for idx, line in enumerate(f): + example_json = json.loads(line) + + idx = example_json['idx'] + text = punctuation_standardization( + example_json['passage']['text']) + entities = set() + + for entity_json in example_json['passage']['entities']: + start = entity_json['start'] + end = entity_json['end'] + entity = punctuation_standardization(text[start:end + 1]) + entities.add(entity) + + entities = list(entities) + entities.sort() + + text = text.replace( + '@highlight\n', '- ' + ) # we follow the GPT-3 paper wrt @highlight annotations + questions = 
example_json['qas'] + + for question_json in questions: + question = punctuation_standardization( + question_json['query']) + question_idx = question_json['idx'] + answers = set() + + for answer_json in question_json.get('answers', []): + answer = punctuation_standardization( + answer_json['text']) + answers.add(answer) + + answers = list(answers) + + if set_type == 'train' or for_train: + # create a single example per *correct* answer + for answer_idx, answer in enumerate(answers): + candidates = [ + ent for ent in entities if ent not in answers + ] + if len(candidates + ) > max_train_candidates_per_question - 1: + entity_shuffler.shuffle(candidates) + candidates = candidates[: + max_train_candidates_per_question + - 1] + + guid = f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}' + meta = { + 'passage_idx': idx, + 'question_idx': question_idx, + 'candidates': [answer] + candidates, + 'answers': [answer] + } + ex_idx = [idx, question_idx, answer_idx] + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label='0', + meta=meta, + idx=ex_idx, + num_choices=len(candidates) + 1) + examples.append(example) + + else: + # create just one example with *all* correct answers and *all* answer candidates + guid = f'{set_type}-p{idx}-q{question_idx}' + meta = { + 'passage_idx': idx, + 'question_idx': question_idx, + 'candidates': entities, + 'answers': answers + } + example = InputExample( + guid=guid, + text_a=text, + text_b=question, + label='1', + meta=meta, + idx=question_idx, + num_choices=len(entities)) + examples.append(example) + + question_indices = list( + set(example.meta['question_idx'] for example in examples)) + label_distribution = Counter(example.label for example in examples) + print_rank_0( + f'Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label ' + f'distribution {list(label_distribution.items())}') + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.tsv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev_matched.tsv'), 'dev_matched') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test_matched.tsv'), 'test_matched') + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['contradiction', 'entailment', 'neutral'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['sentence1']) + text_b = punctuation_standardization(row['sentence2']) + label = row.get('gold_label', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev_mismatched.tsv'), 'dev_mismatched') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test_mismatched.tsv'), 
'test_mismatched') + + +class AgnewsProcessor(DataProcessor): + """Processor for the AG news data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2', '3', '4'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path) as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, headline, body = row + guid = '%s-%s' % (set_type, idx) + text_a = punctuation_standardization( + headline.replace('\\', ' ')) + text_b = punctuation_standardization(body.replace('\\', ' ')) + + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class YahooAnswersProcessor(DataProcessor): + """Processor for the Yahoo Answers data set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, question_title, question_body, answer = row + guid = '%s-%s' % (set_type, idx) + text_a = ' '.join([ + question_title.replace('\\n', ' ').replace('\\', ' '), + question_body.replace('\\n', ' ').replace('\\', ' ') + ]) + text_a = punctuation_standardization(text_a) + text_b = answer.replace('\\n', ' ').replace('\\', ' ') + text_b = punctuation_standardization(text_b) + + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class YelpPolarityProcessor(DataProcessor): + """Processor for the YELP binary classification set.""" + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.csv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.csv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['1', '2'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + + with open(path) as f: + reader = csv.reader(f, delimiter=',') + for idx, row in enumerate(reader): + label, body = row + guid = '%s-%s' % (set_type, idx) + text_a = body.replace('\\n', ' ').replace('\\', ' ') + text_a = punctuation_standardization(text_a) + + example = 
InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class YelpFullProcessor(YelpPolarityProcessor): + """Processor for the YELP full classification set.""" + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_labels(self): + return ['1', '2', '3', '4', '5'] + + +class XStanceProcessor(DataProcessor): + """Processor for the X-Stance data set.""" + + def __init__(self, args, language: str = None): + super().__init__(args) + if language is not None: + assert language in ['de', 'fr'] + self.language = language + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, 'train.jsonl')) + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'test.jsonl')) + + def get_test_examples(self, data_dir) -> List[InputExample]: + raise NotImplementedError() + + def get_unlabeled_examples(self, data_dir) -> List[InputExample]: + return self.get_train_examples(data_dir) + + def get_labels(self): + return ['FAVOR', 'AGAINST'] + + def _create_examples(self, path: str) -> List[InputExample]: + examples = [] + + with open(path, encoding='utf8') as f: + for line in f: + example_json = json.loads(line) + label = example_json['label'] + id_ = example_json['id'] + text_a = punctuation_standardization(example_json['question']) + text_b = punctuation_standardization(example_json['comment']) + language = example_json['language'] + + if self.language is not None and language != self.language: + continue + + example = InputExample( + guid=id_, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class Sst2Processor(DataProcessor): + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train.tsv'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples(os.path.join(data_dir, 'dev.tsv'), 'dev') + + def get_test_examples(self, data_dir) -> List[InputExample]: + return self._create_examples( + os.path.join(data_dir, 'test.tsv'), 'test') + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['sentence']) + label = row.get('label', None) + example = InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class ColaProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + if set_type != 'test': + df = read_tsv(path, header=None) + else: + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + if set_type != 'test': + text_a = punctuation_standardization(row[3]) + label = row[1] + else: + text_a = punctuation_standardization(row['sentence']) + label = None + example = InputExample(guid=guid, text_a=text_a, label=label) + examples.append(example) + + return examples + + +class MrpcProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = 
punctuation_standardization(row['#1 String']) + text_b = punctuation_standardization(row['#2 String']) + label = row.get('Quality', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class QqpProcessor(Sst2Processor): + + def get_labels(self): + return ['0', '1'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['question1']) + text_b = punctuation_standardization(row['question2']) + label = row.get('is_duplicate', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class QnliProcessor(Sst2Processor): + + def get_labels(self): + return ['entailment', 'not_entailment'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + df = read_tsv(path) + + for idx, row in df.iterrows(): + guid = f'{set_type}-{idx}' + text_a = punctuation_standardization(row['question']) + text_b = punctuation_standardization(row['sentence']) + label = row.get('label', None) + example = InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label) + examples.append(example) + + return examples + + +class SquadProcessor(DataProcessor): + + def get_train_examples(self, data_dir): + return self._create_examples( + os.path.join(data_dir, 'train-v2.0.json'), 'train') + + def get_dev_examples(self, data_dir, for_train=False): + return self._create_examples( + os.path.join(data_dir, 'dev-v2.0.json'), 'dev') + + def get_labels(self): + return ['0'] + + @staticmethod + def _create_examples(path: str, set_type: str) -> List[InputExample]: + examples = [] + with open(path) as f: + data = json.load(f)['data'] + + for idx, passage in enumerate(data): + for pid, paragraph in enumerate(passage['paragraphs']): + context = paragraph['context'] + for qid, qas in enumerate(paragraph['qas']): + if len(qas['answers']) == 0: + continue + guid = f'{set_type}-{idx}-{pid}-{qid}' + example = InputExample( + guid=guid, + text_a=context, + text_b=qas['question'], + label='0', + meta={'answer': qas['answers'][0]}) + examples.append(example) + + return examples + + +CLASSIFICATION_DATASETS = {'wic', 'rte', 'cb', 'boolq', 'multirc', 'wsc'} +MULTI_CHOICE_DATASETS = {'copa', 'record'} + +PROCESSORS = { + 'mnli': MnliProcessor, + 'mnli-mm': MnliMismatchedProcessor, + 'agnews': AgnewsProcessor, + 'yahoo': YahooAnswersProcessor, + 'yelp-polarity': YelpPolarityProcessor, + 'yelp-full': YelpFullProcessor, + 'xstance-de': lambda: XStanceProcessor('de'), + 'xstance-fr': lambda: XStanceProcessor('fr'), + 'xstance': XStanceProcessor, + 'wic': WicProcessor, + 'rte': RteProcessor, + 'cb': CbProcessor, + 'wsc': WscProcessor, + 'wsc1': WscProcessor, + 'boolq': BoolQProcessor, + 'copa': CopaProcessor, + 'multirc': MultiRcProcessor, + 'record': RecordProcessor, + 'ax-g': AxGProcessor, + 'ax-b': AxBProcessor, + 'sst2': Sst2Processor, + 'cola': ColaProcessor, + 'mrpc': MrpcProcessor, + 'qqp': QqpProcessor, + 'qnli': QnliProcessor, + 'squad': SquadProcessor, + 'race': RaceProcessor, + 'squad': SquadProcessor +} # type: Dict[str,Callable[[1],DataProcessor]] diff --git a/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py b/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py new file mode 100644 index 00000000..145fb45b --- /dev/null +++ 
b/modelscope/models/nlp/mglm/tasks/superglue/evaluate.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 Zhipu.AI +""" +Official evaluation script for ReCoRD v1.0. +(Some functions are adopted from the SQuAD evaluation script.) +""" + +from __future__ import print_function +import functools +import re +import string +from collections import Counter, defaultdict +from typing import List + +from tasks.data_utils import InputExample + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + if not ground_truths: + return 0.0 + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def qa_evaluate(predictions, labels, examples: List[InputExample], metric): + assert len(examples) == len(predictions) + score = 0.0 + for example, prediction in zip(examples, predictions): + ground_truths = example.meta['answers'] + prediction = example.meta['candidates'][prediction] + if ground_truths: + score += metric_max_over_ground_truths(metric, prediction, + ground_truths) + score = 100.0 * score / len(predictions) + return score + + +def multirc_em(predictions, labels, examples: List[InputExample]): + """Compute the exact match (EM) for a sequence of predictions and actual labels""" + question_ids = [example.meta['question_idx'] for example in examples] + unique_questions = set(question_ids) + + q_actuals = list(zip(question_ids, labels)) + q_predictions = list(zip(question_ids, predictions)) + + actuals_per_question = defaultdict(list) + predictions_per_question = defaultdict(list) + + for qid, val in q_actuals: + actuals_per_question[qid].append(val) + for qid, val in q_predictions: + predictions_per_question[qid].append(val) + + em = 0 + for qid in unique_questions: + if actuals_per_question[qid] == predictions_per_question[qid]: + em += 1 + em /= len(unique_questions) + return em + + +qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) +qa_f1 = functools.partial(qa_evaluate, metric=f1_score) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/finetune.py b/modelscope/models/nlp/mglm/tasks/superglue/finetune.py new file mode 100644 index 00000000..371705ff --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/finetune.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Race.""" + +from collections import OrderedDict + +from finetune_glm import finetune +from tasks.eval_utils import (accuracy_func_provider, accuracy_metric, + f1_macro_metric, f1_metric) +from tasks.superglue.dataset import (CLASSIFICATION_DATASETS, + MULTI_CHOICE_DATASETS, PROCESSORS, + SuperGlueDataset, get_output_func) +from tasks.superglue.evaluate import multirc_em, qa_exact_match, qa_f1 +from tasks.superglue.pvp import PVPS + +DEFAULT_METRICS = { + 'record': [('EM', qa_exact_match), ('F1', qa_f1)], + 'copa': [('accuracy', accuracy_metric)], + 'rte': [('accuracy', accuracy_metric)], + 'boolq': [('accuracy', accuracy_metric)], + 'wic': [('accuracy', accuracy_metric)], + 'wsc': [('accuracy', accuracy_metric)], + 'cb': [('accuracy', accuracy_metric), ('f1-macro', f1_macro_metric)], + 'multirc': [('f1a', f1_metric), ('em', multirc_em), + ('acc', accuracy_metric)], + 'mnli': [('accuracy', accuracy_metric)], + 'sst2': [('accuracy', accuracy_metric)], + 'qnli': [('accuracy', accuracy_metric)], + 'qqp': [('accuracy', accuracy_metric)], + 'mrpc': [('accuracy', accuracy_metric)], + 'cola': [('accuracy', accuracy_metric)], + 'squad': [('accuracy', accuracy_metric)], +} + + +def train_valid_datasets_provider(args, tokenizer, pattern_text=False): + """Provide train and validation datasets.""" + task_name = args.task.lower() + data_dir = args.data_dir + train_dataset = SuperGlueDataset( + args, + task_name, + data_dir, + args.seq_length, + 'train', + tokenizer, + pattern_text=pattern_text) + valid_dataset = SuperGlueDataset( + args, + task_name, + data_dir, + args.seq_length, + 'dev', + tokenizer, + for_train=True, + pattern_text=pattern_text) + + return train_dataset, valid_dataset + + +def metrics_func_provider(args, tokenizer, is_test): + """Privde metrics callback function.""" + + def single_dataset_provider(split): + return SuperGlueDataset(args, args.task.lower(), args.data_dir, + args.seq_length, split, tokenizer) + + output_func = get_output_func(args.task.lower(), args) + eval_func = None + if args.task.lower() in ['wsc', 'squad' + ] and args.cloze_eval and not args.wsc_negative: + from tasks.language_model.finetune import classify_evaluate + eval_func = classify_evaluate + metric_dict = OrderedDict(DEFAULT_METRICS[args.task.lower()]) + return accuracy_func_provider( + single_dataset_provider, + metric_dict, + args, + is_test=is_test, + eval_func=eval_func, + output_func=output_func, + only_rank0=False, + tokenizer=tokenizer) + + +def main(args): + model_kwargs = {} + processor = PROCESSORS[args.task.lower()](args) + pvp = PVPS[args.task.lower()]( + args, + None, + processor.get_labels(), + args.seq_length, + pattern_id=args.pattern_id, + is_multi_token=args.multi_token, + num_prompt_tokens=args.num_prompt_tokens) + if args.continuous_prompt: + model_kwargs['spell_length'] = pvp.spell_length + if args.task.lower() in ['wsc', 'squad' + ] and args.cloze_eval and not args.wsc_negative: + from tasks.language_model.finetune import lm_forward_step 
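+ # Descriptive note (editorial): for WSC and SQuAD under cloze evaluation
+ # (and without wsc_negative), the task is scored generatively, so the
+ # language-model forward step imported above is passed to finetune() here;
+ # the branch below omits forward_step and relies on finetune()'s default.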
+ finetune( + args, + train_valid_datasets_provider, + model_kwargs, + end_of_epoch_callback_provider=metrics_func_provider, + forward_step=lm_forward_step) + else: + if args.cloze_eval: + multi_token = pvp.is_multi_token + else: + multi_token = args.task.lower() in MULTI_CHOICE_DATASETS + args.multi_token = multi_token + if not multi_token: + model_kwargs[ + 'model_type'] = 'multiple_choice' if args.cloze_eval else 'classification' + model_kwargs['multi_token'] = False + model_kwargs['num_labels'] = len(processor.get_labels()) + else: + model_kwargs['model_type'] = 'multiple_choice' + model_kwargs['multi_token'] = True + model_kwargs['num_labels'] = 1 + finetune( + args, + train_valid_datasets_provider, + model_kwargs, + end_of_epoch_callback_provider=metrics_func_provider) diff --git a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py new file mode 100644 index 00000000..ff394172 --- /dev/null +++ b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py @@ -0,0 +1,1541 @@ +# Copyright (c) 2022 Zhipu.AI +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains the pattern-verbalizer pairs (PVPs) for all tasks. +""" +import copy +import math +import random +import string +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Dict, List, Tuple, Union + +import numpy as np +from tasks.data_utils import (InputExample, build_decoder_input, + build_decoder_sample, build_input_from_ids, + build_sample, num_special_tokens_to_add) +from utils import print_rank_0 + +FilledPattern = Tuple[List[Union[str, Tuple[str, bool]]], + List[Union[str, Tuple[str, bool]]]] + + +class PVP(ABC): + """ + This class contains functions to apply patterns and verbalizers as required by PET. Each task requires its own + custom implementation of a PVP. + """ + + def __init__(self, + args, + tokenizer, + label_list, + max_seq_length, + pattern_id: int = 0, + verbalizer_file: str = None, + seed: int = 42, + is_multi_token=False, + max_segment_length=0, + fast_decode: bool = False, + split='train', + num_prompt_tokens=0): + """ + Create a new PVP. 
+ + :param args: the args + :param tokenizer: the tokenizer + :param label_list: the list of labels + :param max_seq_length: the maximum length of the sequence + :param pattern_id: the pattern id to use + :param seed: a seed to be used for generating random numbers if necessary + :param is_multi_token: if the verbalizers contain multiple tokens + :param fast_decode: whether to use the fast decode mode for multi-token tasks + :param continuous_prompt: whether to use continuous prompt optimization + """ + self.args = args + self.tokenizer = tokenizer + self.label_list = label_list + self.max_seq_length = max_seq_length + self.pattern_id = pattern_id + self.num_prompt_tokens = num_prompt_tokens + self.rng = random.Random(seed) + self.num_truncated = 0 + self.fast_decode = fast_decode + self.split = split + self.max_dec_seq_length = 16 + self._is_multi_token = is_multi_token + self.max_segment_length = max_segment_length + self.task_mask = args.task_mask + self.continuous_prompt = args.continuous_prompt + self.prefix_prompt = args.prefix_prompt + if self.continuous_prompt: + print_rank_0( + f'Prompt tokens in pvp {self.num_prompt_tokens} spell length {self.spell_length}' + ) + + if verbalizer_file: + self.verbalize = PVP._load_verbalizer_from_file( + verbalizer_file, self.pattern_id) + + @property + def is_multi_token(self): + return self._is_multi_token + + @property + def spell_length(self): + return 0 + + @property + def mask(self) -> str: + """Return the underlying LM's mask token""" + return self.tokenizer.get_command('MASK').Id + + @property + def mask_id(self) -> int: + """Return the underlying LM's mask id""" + return self.tokenizer.get_command('MASK').Id + + @property + def max_num_verbalizers(self) -> int: + """Return the maximum number of verbalizers across all labels""" + return max(len(self.verbalize(label)) for label in self.label_list) + + @staticmethod + def shortenable(s): + """Return an instance of this string that is marked as shortenable""" + return s, True + + @staticmethod + def remove_final_punc(s: Union[str, Tuple[str, bool]]): + """Remove the final punctuation mark""" + if isinstance(s, tuple): + return PVP.remove_final_punc(s[0]), s[1] + return s.rstrip(string.punctuation) + + @staticmethod + def lowercase_first(s: Union[str, Tuple[str, bool]]): + """Lowercase the first character""" + if isinstance(s, tuple): + return PVP.lowercase_first(s[0]), s[1] + return s[0].lower() + s[1:] + + @staticmethod + def uppercase_first(s: Union[str, Tuple[str, bool]]): + """Lowercase the first character""" + if isinstance(s, tuple): + return PVP.uppercase_first(s[0]), s[1] + return s[0].upper() + s[1:] + + @staticmethod + def available_patterns(): + return [0] + + def replace_prompt_tokens(self, parts_a, parts_b): + if not self.continuous_prompt: + parts_a = [part for part in parts_a if part is not None] + parts_b = [part for part in parts_b if part is not None] + return parts_a, parts_b + num_prompt_tokens = self.num_prompt_tokens + num_pos = 0 + for parts in (parts_a, parts_b): + for part in parts: + if part is None: + num_pos += 1 + avg_prompt_tokens = math.ceil(num_prompt_tokens / num_pos) + new_parts_a, new_parts_b = [], [] + for part in parts_a: + if part is None: + if num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts_a.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts_a.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts_a.append(part) + for part in parts_b: + if part is None: + if 
num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts_b.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts_b.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts_b.append(part) + return new_parts_a, new_parts_b + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. + + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + raw_parts_a, raw_parts_b = self.get_parts(example) + + raw_parts_a = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_a + ] + prompt_id = tokenizer.num_tokens + + def encode_input(raw_parts): + parts = [] + for x, s in raw_parts: + if isinstance(x, str): + x = tokenizer.EncodeAsIds(x) + elif isinstance(x, int): + x = [prompt_id] * x + else: + pass + parts.append((x, s)) + return parts + + parts_a = encode_input(raw_parts_a) + if self.prefix_prompt > 0: + parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a + + parts_b = None + if raw_parts_b: + raw_parts_b = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_b + ] + parts_b = encode_input(raw_parts_b) + + if self.is_multi_token: + answers = self.get_answers(example) + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + + if not self.fast_decode: + ids_list, positions_list, sep_list, mask_list, target_list, prompt_list = [], [], [], [], [], [] + segment_id_list = [] + if priming: + answer = answers[label] + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + self.num_truncated += self.truncate( + parts_a, + parts_b, + answer_ids, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in parts_a for token_id in part + ] + tokens_b = [ + token_id for part, _ in parts_b for token_id in part + ] if parts_b else None + input_ids = tokens_a + if tokens_b: + input_ids += tokens_b + if labeled: + mask_idx = input_ids.index(self.mask_id) + input_ids = input_ids[: + mask_idx] + answer_ids + input_ids[ + mask_idx + 1:] + return input_ids + else: + for idx, answer in enumerate(answers): + this_parts_a, this_parts_b = copy.deepcopy( + parts_a), copy.deepcopy(parts_b) + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [ + tokenizer.get_command('eop').Id + ] + self.num_truncated += self.truncate( + this_parts_a, + this_parts_b, + answer_ids, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in this_parts_a + for token_id in part + ] + tokens_b = [ + token_id for part, _ in this_parts_b + for token_id in part + ] if parts_b else None + if self.max_segment_length > 0: + num_segments = (len(answer_ids) + - 1) // self.max_segment_length + 1 + segments = [ + answer_ids[index + * self.max_segment_length:(index + + 1) + * self.max_segment_length] + for index in range(num_segments) + ] + segment_id_list += [idx] * len(segments) + else: + segments = [answer_ids] + for segment in segments: + data = build_input_from_ids( + tokens_a, + tokens_b, + segment, + self.max_seq_length, + 
self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True, + mask_id=self.mask_id) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) + if token == prompt_id + ] + ids = [ + idx if idx != prompt_id else 0 for idx in ids + ] + prompt_list.append(prompt_pos) + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + target_list.append(target_ids) + mask_list.append(loss_masks) + if self.mask in tokens_a: + mask_pos = tokens_a.index(self.mask) + tokens_a = tokens_a[: + mask_pos] + segment + tokens_a[ + mask_pos:] + else: + mask_pos = tokens_b.index(self.mask) + tokens_b = tokens_b[: + mask_pos] + segment + tokens_b[ + mask_pos:] + segment_id_list = segment_id_list if segment_id_list else None + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + logit_mask=mask_list, + target=target_list, + unique_id=example.guid, + segment_ids=segment_id_list, + prompt_ids=prompt_list) + return sample + else: + this_parts_a, this_parts_b = copy.deepcopy( + parts_a), copy.deepcopy(parts_b) + self.num_truncated += self.truncate( + this_parts_a, + this_parts_b, + None, + max_length=self.max_seq_length) + tokens_a = [ + token_id for part, _ in this_parts_a for token_id in part + ] + tokens_b = [ + token_id for part, _ in this_parts_b for token_id in part + ] if parts_b else None + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=False) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + sample = build_sample( + ids, + positions=position_ids, + masks=sep, + label=label, + unique_id=example.guid) + + ids_list, positions_list, mask_list, target_list, logit_mask_list = [], [], [], [], [] + for answer in answers: + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [tokenizer.get_command('eop').Id] + answer_ids = answer_ids[:self.max_dec_seq_length] + data = build_decoder_input(ids, answer_ids, + self.max_seq_length, + self.max_dec_seq_length, + tokenizer) + dec_ids, _, _, dec_position_ids, _, dec_target_ids, dec_loss_masks = data + ids_list.append(dec_ids) + positions_list.append(dec_position_ids) + mask_list.append(sep) + target_list.append(dec_target_ids) + logit_mask_list.append(dec_loss_masks) + + sample = build_decoder_sample(sample, ids_list, positions_list, + mask_list, target_list, + logit_mask_list) + return sample + + else: + self.num_truncated += self.truncate( + parts_a, parts_b, [], max_length=self.max_seq_length) + + tokens_a = [token_id for part, _ in parts_a for token_id in part] + tokens_b = [token_id for part, _ in parts_b + for token_id in part] if parts_b else None + if priming: + input_ids = tokens_a + if tokens_b: + input_ids += tokens_b + if labeled: + mask_idx = input_ids.index(self.mask_id) + verbalizer = self.verbalize(example.label) + assert len( + verbalizer + ) == 1, 'priming only supports one verbalization per label' + verbalizer = verbalizer[0] + verbalizer_id = get_verbalization_ids( + verbalizer, self.tokenizer, force_single_token=True) + input_ids[mask_idx] = verbalizer_id + return input_ids + data = build_input_from_ids( + tokens_a, + tokens_b, + None, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, 
target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) if token == prompt_id + ] + ids = [token if token != prompt_id else 0 for token in ids] + target_ids = self.get_verbalizer_ids() + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + sample = build_sample( + ids=ids, + positions=position_ids, + target=target_ids, + masks=sep, + logit_mask=loss_masks, + label=label, + unique_id=example.guid, + prompt_ids=prompt_pos) + return sample + + @staticmethod + def _seq_length(parts: List[Tuple[List[int], bool]], + only_shortenable: bool = False): + return sum([ + len(x) for x, shortenable in parts + if not only_shortenable or shortenable + ]) if parts else 0 + + @staticmethod + def _remove_last(parts: List[Tuple[List[int], bool]]): + last_idx = max(idx for idx, (seq, shortenable) in enumerate(parts) + if shortenable and seq) + parts[last_idx] = (parts[last_idx][0][:-1], parts[last_idx][1]) + + def truncate(self, parts_a: List[Tuple[List[int], bool]], + parts_b: List[Tuple[List[int], bool]], answer: List[int], + max_length: int): + """Truncate two sequences of text to a predefined total maximum length""" + total_len = self._seq_length(parts_a) + self._seq_length(parts_b) + if answer: + total_len += len(answer) + total_len += num_special_tokens_to_add( + parts_a, + parts_b, + answer, + add_cls=True, + add_sep=False, + add_piece=True) + num_tokens_to_remove = total_len - max_length + + if num_tokens_to_remove <= 0: + return False + + for _ in range(num_tokens_to_remove): + if self._seq_length( + parts_a, only_shortenable=True) > self._seq_length( + parts_b, only_shortenable=True): + self._remove_last(parts_a) + else: + self._remove_last(parts_b) + return True + + @abstractmethod + def get_parts(self, example: InputExample) -> FilledPattern: + """ + Given an input example, apply a pattern to obtain two text sequences (text_a and text_b) containing exactly one + mask token (or one consecutive sequence of mask tokens for PET with multiple masks). If a task requires only a + single sequence of text, the second sequence should be an empty list. + + :param example: the input example to process + :return: Two sequences of text. All text segments can optionally be marked as being shortenable. + """ + pass + + def get_answers(self, example: InputExample): + return [self.verbalize(label)[0] for label in self.label_list] + + def get_verbalizer_ids(self): + target_ids = [] + for label in self.label_list: + verbalizer = self.verbalize(label)[0] + verbalizer_id = get_verbalization_ids( + verbalizer, self.tokenizer, force_single_token=True) + target_ids.append(verbalizer_id) + return target_ids + + @abstractmethod + def verbalize(self, label) -> List[str]: + """ + Return all verbalizations for a given label. 
+ + :param label: the label + :return: the list of verbalizations + """ + pass + + def get_mask_positions(self, input_ids: List[int]) -> List[int]: + label_idx = input_ids.index(self.mask_id) + labels = [-1] * len(input_ids) + labels[label_idx] = 1 + return labels + + @staticmethod + def _load_verbalizer_from_file(path: str, pattern_id: int): + + verbalizers = defaultdict( + dict) # type: Dict[int, Dict[str, List[str]]] + current_pattern_id = None + + with open(path, 'r') as fh: + for line in fh.read().splitlines(): + if line.isdigit(): + current_pattern_id = int(line) + elif line: + label, *realizations = line.split() + verbalizers[current_pattern_id][label] = realizations + + print_rank_0( + 'Automatically loaded the following verbalizer: \n {}'.format( + verbalizers[pattern_id])) + + def verbalize(label) -> List[str]: + return verbalizers[pattern_id][label] + + return verbalize + + +class CopaPVP(PVP): + + @staticmethod + def available_patterns(): + return [0, 1] + + @property + def is_multi_token(self): + return True + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + @property + def mask(self) -> str: + """Return the underlying LM's mask token""" + mask_token = 'MASK' + return self.tokenizer.get_command(mask_token).Id + + @property + def mask_id(self) -> int: + """Return the underlying LM's mask id""" + mask_token = 'MASK' + return self.tokenizer.get_command(mask_token).Id + + def get_answers(self, example: InputExample): + choice1 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + return [choice1, choice2] + + def get_parts(self, example: InputExample) -> FilledPattern: + assert self.pattern_id in [0, 1, 2, 3] + premise = self.remove_final_punc( + self.shortenable(' ' + example.text_a)) + choice1 = self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + + question = example.meta['question'] + assert question in ['cause', 'effect'] + if question == 'cause': + joiner = ' because' + else: + joiner = ', so' + if self.pattern_id == 0: + parts_a, parts_b = [ + None, '"', choice1, '" or "', choice2, '"?', None, premise, + joiner, None, [self.mask], '.' + ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + None, choice1, ' or', ' ' + choice2, '?', None, premise, + joiner, None, [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, '"', choice1, '" or "', choice2, '"', None, premise, + joiner, [self.mask], '.', None + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + return [] + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. 
+ + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + if self.continuous_prompt or self.pattern_id < 2: + return super().encode(example, priming=priming, labeled=labeled) + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + premise = self.remove_final_punc(self.shortenable(example.text_a)) + choice1 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice1'])) + choice2 = ' ' + self.remove_final_punc( + self.lowercase_first(example.meta['choice2'])) + question = example.meta['question'] + assert question in ['cause', 'effect'] + answer = ' because' if question == 'cause' else ' so' + answer_ids = [ + get_verbalization_ids(answer, tokenizer, force_single_token=True) + ] + if self.is_multi_token: + answer_ids.append(tokenizer.get_command('eop').Id) + + ids_list, positions_list, sep_list, mask_list, target_list = [], [], [], [], [] + + for choice in [choice1, choice2]: + parts = [ + '"', choice1[1:], '" or "', choice2[1:], '"?', premise, + [self.mask], choice + ] + parts = [x if isinstance(x, tuple) else (x, False) for x in parts] + parts = [(tokenizer.EncodeAsIds(x).tokenization if isinstance( + x, str) else x, s) for x, s in parts if x] + self.num_truncated += self.truncate( + parts, None, answer_ids, max_length=self.max_seq_length) + tokens_a = [token_id for part, _ in parts for token_id in part] + data = build_input_from_ids( + tokens_a, + None, + answer_ids, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + ids_list.append(ids) + positions_list.append(position_ids) + sep_list.append(sep) + target_list.append(target_ids) + mask_list.append(loss_masks) + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + sample = build_sample( + ids_list, + positions=positions_list, + masks=sep_list, + label=label, + logit_mask=mask_list, + target=target_list, + unique_id=example.guid) + return sample + + +class WscPVP(PVP): + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def is_multi_token(self): + return True + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_answers(self, example: InputExample): + target = ' ' + example.meta['span1_text'] + answers = [target] + if 'candidates' in example.meta: + candidates = example.meta['candidates'] + # if len(candidates) > 10: + # random.shuffle(candidates) + # candidates = candidates[:10] + answers += [' ' + cand for cand in candidates] + return answers + + def get_parts(self, example: InputExample) -> FilledPattern: + pronoun = example.meta['span2_text'] + pronoun_idx = example.meta['span2_index'] + + words_a = example.text_a.split() + words_a[pronoun_idx] = '*' + words_a[pronoun_idx] + '*' + text_a = ' '.join(words_a) + text_a = self.shortenable(text_a) + + if self.pattern_id == 0: + parts_a, parts_b = [ + None, text_a, + None, " The pronoun '*" + pronoun + "*' refers to", None, + [self.mask], '.' 
+ ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + None, text_a, None, " In the previous sentence, the pronoun '*" + + pronoun + "*' refers to", None, [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, text_a, None, + " Question: In the passage above, what does the pronoun '*" + + pronoun + "*' refer to?", None, ' Answer:', [self.mask], '.' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def encode(self, + example: InputExample, + priming: bool = False, + labeled: bool = False): + """ + Encode an input example using this pattern-verbalizer pair. + + :param example: the input example to encode + :param priming: whether to use this example for priming + :param labeled: if ``priming=True``, whether the label should be appended to this example + :return: A tuple, consisting of a list of input ids and a list of token type ids + """ + if self.args.loss_func in ['generative', 'mix']: + sample = super().encode(example, priming=priming, labeled=labeled) + if self.split == 'train': + sample['label'] = 0 + return sample + + if not priming: + assert not labeled, "'labeled' can only be set to true if 'priming' is also set to true" + + tokenizer = self.tokenizer + prompt_id = tokenizer.num_tokens + raw_parts_a, raw_parts_b = self.get_parts(example) + + raw_parts_a = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_a + ] + + def encode_input(raw_parts): + parts = [] + for x, s in raw_parts: + if isinstance(x, str): + x = tokenizer.EncodeAsIds(x) + elif isinstance(x, int): + x = [prompt_id] * x + else: + pass + parts.append((x, s)) + return parts + + parts_a = encode_input(raw_parts_a) + if self.prefix_prompt > 0: + parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a + parts_b = None + if raw_parts_b: + raw_parts_b = [ + x if isinstance(x, tuple) else (x, False) for x in raw_parts_b + ] + parts_b = encode_input(raw_parts_b) + answer = self.get_answers(example)[0] + answer_ids = get_verbalization_ids( + answer, tokenizer, force_single_token=False) + answer_ids = answer_ids + [tokenizer.get_command('eop').Id] + self.num_truncated += self.truncate( + parts_a, parts_b, answer_ids, max_length=self.max_seq_length) + tokens_a = [token_id for part, _ in parts_a for token_id in part] + tokens_b = [token_id for part, _ in parts_b + for token_id in part] if parts_b else None + data = build_input_from_ids( + tokens_a, + tokens_b, + answer_ids, + self.max_seq_length, + self.tokenizer, + args=self.args, + add_cls=True, + add_sep=False, + add_piece=True) + ids, types, paddings, position_ids, sep, target_ids, loss_masks = data + prompt_pos = [ + idx for idx, token in enumerate(ids) if token == prompt_id + ] + ids = [token if token != prompt_id else 0 for token in ids] + if example.label is not None: + label = self.label_list.index(example.label) + else: + label = 0 + return { + 'text': np.array(ids, dtype=np.int64), + 'target': np.array(target_ids, dtype=np.int64), + 'attention_mask': np.array(sep, dtype=np.int64), + 'loss_mask': np.array(loss_masks, dtype=np.int64), + 'position_id': np.array(position_ids, dtype=np.int64), + 'prompt_pos': np.array(prompt_pos, dtype=np.int64), + 'label': label, + 'uid': example.guid + } + + def verbalize(self, label) -> List[str]: + return [] + + +class RecordPVP(PVP): + + @property + def is_multi_token(self): + return True + + def get_answers(self, example: InputExample): + choices = 
example.meta['candidates'] + choices = [' ' + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + premise = self.shortenable(example.text_a) + + assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' + question_a, question_b = example.text_b.split('@placeholder') + return [premise, ' ' + question_a.rstrip(), [self.mask], + question_b], [] + + def verbalize(self, label) -> List[str]: + return [] + + +class RacePVP(PVP): + + @property + def is_multi_token(self): + return True + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_answers(self, example: InputExample): + choices = example.meta['choices'] + choices = [' ' + choice for choice in choices] + return choices + + def get_parts(self, example: InputExample) -> FilledPattern: + context = self.shortenable(example.text_a) + question = ' ' + example.text_b + + if '_' in question: + left, right = question.split('_', maxsplit=1) + if self.pattern_id == 0: + return [context], [ + self.shortenable(left.rstrip()), [self.mask], + self.shortenable(right) + ] + else: + left = left.rstrip() + if left: + left = self.lowercase_first(left) + return [context], [ + ' Based on the previous passage,', + self.shortenable(left), [self.mask], + self.shortenable(right) + ] + else: + if self.pattern_id == 0: + return [context], [ + ' Question:', + self.shortenable(question), ' Answer:', [self.mask] + ] + else: + return [context], [ + ' Based on the previous passage,', + self.shortenable(question), [self.mask] + ] + + def verbalize(self, label) -> List[str]: + return [] + + +class RtePVP(PVP): + VERBALIZER = {'not_entailment': [' No'], 'entailment': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + # switch text_a and text_b to get the correct order + text_a = example.text_a + text_b = example.text_b.rstrip(string.punctuation) + if self.pattern_id == 0: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], ',', None, ' "', + self.shortenable(text_a), '"' + ] # noqa + elif self.pattern_id == 1: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], ',', None, + self.shortenable(' ' + text_a) + ] + elif self.pattern_id == 2: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], '. 
"', None, + self.shortenable(text_a), '"' + ] # noqa + elif self.pattern_id == 3: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], '.', None, + self.shortenable(' ' + text_a) + ] + elif self.pattern_id == 4: + parts_a, parts_b = [ + None, + self.shortenable(text_a), None, ' question:', + self.shortenable(' ' + text_b), ' True or False?', None, + ' answer:', [self.mask] + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [' false'] + return RtePVP.VERBALIZER[label] + + +class CbPVP(RtePVP): + VERBALIZER = { + 'contradiction': [' No'], + 'entailment': [' Yes'], + 'neutral': [' Maybe'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + def get_parts(self, example: InputExample) -> FilledPattern: + if self.pattern_id == 4: + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(' ' + example.text_b) + parts_a, parts_b = [ + None, text_a, None, ' question:', text_b, + ' true, false or neither?', None, ' answer:', [self.mask] + ], [] + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + return super().get_parts(example) + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [ + ' false' + ] if label == 'contradiction' else [' neither'] + return CbPVP.VERBALIZER[label] + + +class BoolQPVP(PVP): + VERBALIZER_A = {'false': [' No'], 'true': [' Yes']} + + VERBALIZER_B = {'false': [' false'], 'true': [' true']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + passage = example.text_a + question = example.text_b + + if self.pattern_id < 2: + parts_a, parts_b = [ + None, + self.shortenable(passage), None, ' Question:', + self.shortenable(' ' + question), '? Answer:', None, + [self.mask], '.' + ], [] + elif self.pattern_id < 4: + parts_a, parts_b = [ + None, + self.shortenable(passage), ' Based on the previous passage,', + None, + self.shortenable(' ' + question), '?', None, [self.mask], '.' 
+ ], [] + elif self.pattern_id < 6: + parts_a, parts_b = [ + 'Based on the following passage', None, + self.shortenable(' ' + question), '?', None, [self.mask], '.', + None, + self.shortenable(' ' + passage) + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0 or self.pattern_id == 2 or self.pattern_id == 4: + return BoolQPVP.VERBALIZER_A[label] + else: + return BoolQPVP.VERBALIZER_B[label] + + +class MultiRcPVP(PVP): + VERBALIZER = {0: [' No'], 1: [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + passage = self.remove_final_punc( + self.shortenable(example.text_a.rstrip())) + question = self.remove_final_punc(example.text_b.rstrip()) + answer = example.meta['answer'] + if self.pattern_id == 0: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question + '?', None, + ' Is it', ' ' + answer, '?', None, [self.mask], '.' + ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question, '?', + None, ' Is the correct answer "', answer, '"?', None, + [self.mask], '.' + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + passage, '. Based on the previous passage,', None, + ' ' + question, '?', None, ' Is "', answer, + '" a correct answer?', None, [self.mask], '.' + ], [] + elif self.pattern_id == 3: + parts_a, parts_b = [ + None, passage, None, ' ' + question, '- [', [self.mask], ']', + None, answer + ], [] + elif self.pattern_id == 4: + parts_a, parts_b = [ + passage, '.', None, ' Question:', ' ' + question, '?', None, + ' ' + answer, '?', None, [self.mask], '.' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 3: + return [' False'] if label == 0 else [' True'] + return MultiRcPVP.VERBALIZER[label] + + +class WicPVP(PVP): + VERBALIZER_A = {'false': [' No'], 'true': [' Yes']} + VERBALIZER_B = {'false': ['2'], 'true': ['b']} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = example.text_a + text_b = example.text_b + word = example.meta['word'] + + if self.pattern_id == 0: + parts_a, parts_b = [ + None, + self.shortenable('"' + text_a + '" / "' + text_b + '"'), None, + ' Similar sense of "' + word + '"?', None, [self.mask], '.' 
+ ], [] + elif self.pattern_id == 1: + parts_a, parts_b = [ + self.shortenable(text_a), None, + self.shortenable(' ' + text_b), None, + ' Does ' + word + ' have the same meaning in both sentences?', + None, [self.mask] + ], [] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, word, ' .', None, ' Sense (1) (a) "', + self.shortenable(text_a), '"', None, ' (', [self.mask], ') "', + text_b, '"' + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 2: + return WicPVP.VERBALIZER_B[label] + return WicPVP.VERBALIZER_A[label] + + +class AgnewsPVP(PVP): + VERBALIZER = { + '1': [' World'], + '2': [' Sports'], + '3': [' Business'], + '4': [' Tech'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0: + return [[self.mask], ':', text_a, text_b], [] + elif self.pattern_id == 1: + return [[self.mask], ' News:', text_a, text_b], [] + elif self.pattern_id == 2: + return [text_a, '(', [self.mask], ')', text_b], [] + elif self.pattern_id == 3: + return [text_a, text_b, '(', [self.mask], ')'], [] + elif self.pattern_id == 4: + return ['[ Category:', [self.mask], ']', text_a, text_b], [] + elif self.pattern_id == 5: + return [[self.mask], '-', text_a, text_b], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return AgnewsPVP.VERBALIZER[label] + + +class YahooPVP(PVP): + VERBALIZER = { + '1': [' Society'], + '2': [' Science'], + '3': [' Health'], + '4': [' Education'], + '5': [' Computer'], + '6': [' Sports'], + '7': [' Business'], + '8': [' Entertainment'], + '9': [' Relationship'], + '10': [' Politics'], + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0: + return [[self.mask], ':', text_a, text_b], [] + elif self.pattern_id == 1: + return [[self.mask], ' Question:', text_a, text_b], [] + elif self.pattern_id == 2: + return [text_a, '(', [self.mask], ')', text_b], [] + elif self.pattern_id == 3: + return [text_a, text_b, '(', [self.mask], ')'], [] + elif self.pattern_id == 4: + return ['[ Category:', [self.mask], ']', text_a, text_b], [] + elif self.pattern_id == 5: + return [[self.mask], '-', text_a, text_b], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return YahooPVP.VERBALIZER[label] + + +class MnliPVP(PVP): + VERBALIZER_A = { + 'contradiction': [' Wrong'], + 'entailment': [' Right'], + 'neutral': [' Maybe'] + } + VERBALIZER_B = { + 'contradiction': [' No'], + 'entailment': [' Yes'], + 'neutral': [' Maybe'] + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(self.remove_final_punc(example.text_a)) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0 or self.pattern_id == 2: + return ['"', text_a, '" ?'], [[self.mask], ', "', text_b, '"'] + elif self.pattern_id == 1 or self.pattern_id == 3: + return [text_a, 
'?'], [[self.mask], ',', text_b] + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0 or self.pattern_id == 1: + return MnliPVP.VERBALIZER_A[label] + return MnliPVP.VERBALIZER_B[label] + + +class YelpPolarityPVP(PVP): + VERBALIZER = {'1': [' bad'], '2': [' good']} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3] + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + + if self.pattern_id == 0: + return ['It was', [self.mask], '.', text], [] + elif self.pattern_id == 1: + return [text, '. All in all, it was', [self.mask], '.'], [] + elif self.pattern_id == 2: + return ['Just', [self.mask], '!'], [text] + elif self.pattern_id == 3: + return [text], [' In summary, the restaurant is', [self.mask], '.'] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return YelpPolarityPVP.VERBALIZER[label] + + +class YelpFullPVP(YelpPolarityPVP): + VERBALIZER = { + '1': [' terrible'], + '2': [' bad'], + '3': [' okay'], + '4': [' good'], + '5': [' great'] + } + + def verbalize(self, label) -> List[str]: + return YelpFullPVP.VERBALIZER[label] + + +class XStancePVP(PVP): + VERBALIZERS = { + 'en': { + 'FAVOR': ['Yes'], + 'AGAINST': ['No'] + }, + 'de': { + 'FAVOR': ['Ja'], + 'AGAINST': ['Nein'] + }, + 'fr': { + 'FAVOR': ['Oui'], + 'AGAINST': ['Non'] + } + } + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4, 5] + + def get_parts(self, example: InputExample) -> FilledPattern: + + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(example.text_b) + + if self.pattern_id == 0 or self.pattern_id == 2 or self.pattern_id == 4: + return ['"', text_a, '"'], [[self.mask], '. 
"', text_b, '"'] + elif self.pattern_id == 1 or self.pattern_id == 3 or self.pattern_id == 5: + return [text_a], [[self.mask], '.', text_b] + + def verbalize(self, label) -> List[str]: + lang = 'de' if self.pattern_id < 2 else 'en' if self.pattern_id < 4 else 'fr' + return XStancePVP.VERBALIZERS[lang][label] + + +class Sst2PVP(PVP): + VERBALIZER_A = {'0': [' terrible'], '1': [' great']} + + VERBALIZER_B = {'0': [' bad'], '1': [' good']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + if self.pattern_id == 0 or self.pattern_id == 1: + return [text, ' It was', [self.mask], '.'], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 0: + return Sst2PVP.VERBALIZER_A[label] + else: + return Sst2PVP.VERBALIZER_B[label] + + +class ColaPVP(PVP): + VERBALIZER = {'0': [' incorrect'], '1': [' correct']} + + def get_parts(self, example: InputExample) -> FilledPattern: + text = self.shortenable(example.text_a) + if self.pattern_id == 0: + return ['"', text, '"', ' This is', [self.mask], '.'], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return ColaPVP.VERBALIZER[label] + + +class MrpcPVP(PVP): + VERBALIZER = {'0': [' No'], '1': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(example.text_a) + if self.pattern_id == 0: + text_b = self.shortenable(self.lowercase_first(example.text_b)) + return [text_a], [[self.mask], ', ', text_b] + elif self.pattern_id == 1: + text_b = self.shortenable( + self.remove_final_punc(self.lowercase_first(example.text_b))) + return [text_a], [' Does it mean that', text_b, '?', [self.mask]] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return MrpcPVP.VERBALIZER[label] + + +class QqpPVP(PVP): + VERBALIZER = {'0': [' No'], '1': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1] + + def get_parts(self, example: InputExample) -> FilledPattern: + text_a = self.shortenable(example.text_a) + text_b = self.shortenable(self.lowercase_first(example.text_b)) + if self.pattern_id == 0: + return [text_a], [' Do you mean ', text_b, [self.mask], '.'] + elif self.pattern_id == 1: + return [text_a], [[self.mask], ', ', text_b] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return QqpPVP.VERBALIZER[label] + + +class QnliPVP(PVP): + VERBALIZER = {'not_entailment': [' No'], 'entailment': [' Yes']} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + def get_parts(self, example: InputExample) -> FilledPattern: + question = self.remove_final_punc(example.text_a) + passage = example.text_b + if self.pattern_id == 0: + return [ + self.shortenable(passage), ' Question:', + self.shortenable(' ' + question), '? Do you know the answer?', + [self.mask], '.' + ], [] + elif self.pattern_id == 1: + return [ + self.shortenable(passage), + ' Based on the previous passage, do you know the answer', + self.shortenable(' ' + question), '?', [self.mask], '.' 
+ ], [] + elif self.pattern_id == 2: + return [ + 'Based on the following passage, do you know the answer', + self.shortenable(' ' + question), '?', [self.mask], '.', + self.shortenable(' ' + passage) + ], [] + else: + raise ValueError('No pattern implemented for id {}'.format( + self.pattern_id)) + + def verbalize(self, label) -> List[str]: + return QnliPVP.VERBALIZER[label] + + +class SquadPVP(PVP): + + @property + def is_multi_token(self): + return True + + def get_answers(self, example: InputExample): + target = ' ' + example.meta['answer']['text'] + answers = [target] + return answers + + def get_parts(self, example: InputExample) -> FilledPattern: + context = self.shortenable(example.text_a) + question = example.text_b + return [context, ' ' + question, [self.mask], '.'], [] + + def verbalize(self, label) -> List[str]: + return [] + + +def get_verbalization_ids(word: str, tokenizer, + force_single_token: bool) -> Union[int, List[int]]: + """ + Get the token ids corresponding to a verbalization + + :param word: the verbalization + :param tokenizer: the tokenizer to use + :param force_single_token: whether it should be enforced that the verbalization corresponds to a single token. + If set to true, this method returns a single int instead of a list and throws an error if the word + corresponds to multiple tokens. + :return: either the list of token ids or the single token id corresponding to this word + """ + ids = tokenizer.EncodeAsIds(word).tokenization + if not force_single_token: + return ids + assert len(ids) == 1, \ + f'Verbalization "{word}" does not correspond to a single token, got {tokenizer.DecodeIds(ids)}' + verbalization_id = ids[0] + assert verbalization_id not in tokenizer.command_id_map, \ + f'Verbalization {word} is mapped to a special token {tokenizer.IdToToken(verbalization_id)}' + return verbalization_id + + +PVPS = { + 'agnews': AgnewsPVP, + 'mnli': MnliPVP, + 'yelp-polarity': YelpPolarityPVP, + 'yelp-full': YelpFullPVP, + 'yahoo': YahooPVP, + 'xstance': XStancePVP, + 'xstance-de': XStancePVP, + 'xstance-fr': XStancePVP, + 'rte': RtePVP, + 'wic': WicPVP, + 'cb': CbPVP, + 'wsc': WscPVP, + 'boolq': BoolQPVP, + 'copa': CopaPVP, + 'multirc': MultiRcPVP, + 'record': RecordPVP, + 'ax-b': RtePVP, + 'ax-g': RtePVP, + 'sst2': Sst2PVP, + 'cola': ColaPVP, + 'mrpc': MrpcPVP, + 'qqp': QqpPVP, + 'qnli': QnliPVP, + 'squad': SquadPVP, + 'race': RacePVP, +} diff --git a/modelscope/models/nlp/mglm/test/__init__.py b/modelscope/models/nlp/mglm/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/mglm/test/test_block.py b/modelscope/models/nlp/mglm/test/test_block.py new file mode 100644 index 00000000..ed4225da --- /dev/null +++ b/modelscope/models/nlp/mglm/test/test_block.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 Zhipu.AI + +import random +from argparse import Namespace + +import numpy as np +from blocklm_utils import ConstructBlockStrategy + + +# rng = random.Random() +# span_lengths = [2, 3, 4, 2, 3, 4] +# length = 100 +# +# counts = np.array([0] * length) +# for _ in range(10000): +# rng.shuffle(span_lengths) +# spans = ConstructBlockStrategy.sample_spans(span_lengths, length, rng) +# for start, end in spans: +# counts[start: end] += 1 +# print(counts) +def main(): + args = Namespace() + args.seq_length = 10 + args.eod_token = 0 + + strategy = ConstructBlockStrategy( + args, None, bert_ratio=0.4, max_seq_length=128) + counts = np.array([0] * 10) + for _ in range(10000): + spans = strategy.sample_span_in_document( + np.array([1, 2, 3, 
0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1], + random.Random()) + for start, end in spans: + counts[start:end] += 1 + + print(counts) diff --git a/modelscope/models/nlp/mglm/test/test_rel_shift.py b/modelscope/models/nlp/mglm/test/test_rel_shift.py new file mode 100644 index 00000000..00cbb9fe --- /dev/null +++ b/modelscope/models/nlp/mglm/test/test_rel_shift.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 Zhipu.AI + +import matplotlib.pyplot as plt +import numpy as np +from learning_rates import AnnealingLR +from torch.nn.modules import Linear +from torch.optim import Adam + + +def main(): + model = Linear(10, 10) + optimizer = Adam(model.parameters()) + lr_scheduler = AnnealingLR( + optimizer, + start_lr=0.00015, + warmup_iter=3000, + num_iters=300000, + decay_style='cosine', + decay_ratio=0.1) + steps = np.arange(0, 400000, 10, dtype=np.long) + rates = [] + for step in steps: + lr_scheduler.num_iters = step + rates.append(lr_scheduler.get_lr()) + print(rates) + plt.plot(steps, rates) + plt.savefig('lr.pdf', format='pdf') diff --git a/modelscope/models/nlp/mglm/train_utils.py b/modelscope/models/nlp/mglm/train_utils.py new file mode 100644 index 00000000..c9c0de8e --- /dev/null +++ b/modelscope/models/nlp/mglm/train_utils.py @@ -0,0 +1,472 @@ +# Copyright (c) 2022 Zhipu.AI + +import deepspeed +import torch +from apex.optimizers import FusedAdam as Adam +from torch import distributed as dist + +from . import mpu +from .fp16 import DynamicLossScaler, FP16_Module, FP16_Optimizer +from .model import DistributedDataParallel as LocalDDP +from .model import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, + GLMForSequenceClassification, GLMForSingleTokenCloze, + GLMModel) +from .model import PyTorchDistributedDataParallel as TorchDDP +from .model import glm_get_params_for_weight_decay_optimization +from .utils import get_checkpoint_iteration, get_checkpoint_name, print_rank_0 + + +def load_pretrained(model, checkpoint_path, args, task_tokens=None): + load_dir, tag, release, success = get_checkpoint_iteration(checkpoint_path) + checkpoint_name = get_checkpoint_name(load_dir, tag, release) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading pretrained model {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + # Load the checkpoint. + sd = torch.load(checkpoint_name, map_location='cpu') + if args.deepspeed: + model = model.module + if isinstance(model, TorchDDP): + model = model.module + if isinstance(model, FP16_Module): + model = model.module + if hasattr(model, 'model'): + model = model.model + + # Model. 
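+    # The pretrained checkpoint may hold fewer (block-)position embedding rows than
+    # args.max_position_embeddings + 1; the helper below copies the existing rows into
+    # the larger, freshly initialised table so longer sequences can be fine-tuned.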
+ def extend_embedding_weights(state_weights, model_weights): + original_length = state_weights.shape[0] + assert original_length <= args.max_position_embeddings + 1 + new_weights = model_weights.clone() + new_weights[:original_length] = state_weights + return new_weights + + if args.block_lm: + if 'transformer.block_position_embeddings.weight' in sd['module']: + position_weights = sd['module'][ + 'transformer.position_embeddings.weight'] + if args.max_position_embeddings + 1 > position_weights.shape[0]: + sd['module'][ + 'transformer.position_embeddings.weight'] = extend_embedding_weights( + position_weights, + model.state_dict() + ['transformer.position_embeddings.weight'].data) + print_rank_0( + f'Extend position embedding to {args.max_position_embeddings + 1}' + ) + if 'transformer.block_position_embeddings.weight' in sd['module']: + block_position_weights = sd['module'][ + 'transformer.block_position_embeddings.weight'] + if args.max_position_embeddings + 1 > block_position_weights.shape[ + 0]: + sd['module'][ + 'transformer.block_position_embeddings.weight'] = extend_embedding_weights( + block_position_weights, + model.state_dict() + ['transformer.block_position_embeddings.weight'].data) + print_rank_0( + f'Extend block position embedding to {args.max_position_embeddings + 1}' + ) + for key in list(model.state_dict().keys()): + print(key) + model.state_dict()[key.replace( + 'mixins.block_position_embedding.block_position_embeddings.weight', + 'transformer.block_position_embeddings.weight').replace( + 'transformer.word_embeddings.weight', + 'word_embeddings.weight')] = model.state_dict().pop(key) + + missing_keys, unexpected_keys = model.load_state_dict( + sd['module'], strict=False) + if missing_keys or unexpected_keys: + print_rank_0( + f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}') + if args.continuous_prompt and args.prompt_init: + model.prompt_spell.init_embedding(model.word_embeddings.weight.data, + task_tokens) + + +def get_model(args, + model_type=None, + multi_token=True, + num_labels=None, + spell_length=None): + """Build the model.""" + print_rank_0('building GPT2 model ...') + if args.pretrained_bert: + if model_type == 'multiple_choice': + model = BertForMultipleChoice.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon) + elif model_type == 'classification': + model = BertForSequenceClassification.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon, + num_labels=num_labels) + else: + raise NotImplementedError + else: + output_predict, paralle_output = True, True + if (model_type == 'multiple_choice' + or model_type == 'classification') and not args.cloze_eval: + output_predict = False + if model_type is not None: + paralle_output = False + if spell_length is not None: + print_rank_0(f'Continuous spell length {spell_length}') + model = GLMModel( + num_layers=args.num_layers, + vocab_size=args.vocab_size, + hidden_size=args.hidden_size, + num_attention_heads=args.num_attention_heads, + embedding_dropout_prob=args.hidden_dropout, + attention_dropout_prob=args.attention_dropout, + output_dropout_prob=args.hidden_dropout, + max_sequence_length=args.max_position_embeddings, + max_memory_length=args.mem_length, + checkpoint_activations=args.checkpoint_activations, + 
checkpoint_num_layers=args.checkpoint_num_layers, + parallel_output=paralle_output, + relative_encoding=args.transformer_xl, + block_position_encoding=args.block_lm and not args.masked_lm, + output_predict=output_predict, + spell_length=spell_length, + spell_func=args.prompt_func, + attention_scale=args.attention_scale) + if args.freeze_transformer: + model.freeze_transformer( + tune_prefix_layers=args.tune_prefix_layers) + if model_type is not None: + if model_type == 'multiple_choice': + if args.cloze_eval: + if multi_token: + if args.fast_decode: + model = GLMForMultiTokenClozeFast( + model, length_penalty=args.length_penalty) + else: + model = GLMForMultiTokenCloze( + model, length_penalty=args.length_penalty) + else: + model = GLMForSingleTokenCloze( + model, take_softmax=args.adapet) + else: + model = GLMForSequenceClassification( + model, + args.hidden_size, + args.output_dropout, + args.pool_token, + num_class=num_labels) + elif model_type == 'classification': + model = GLMForSequenceClassification( + model, + args.hidden_size, + args.output_dropout, + args.pool_token, + num_class=num_labels) + elif model_type == 'generation': + pass + else: + raise NotImplementedError(model_type) + + if mpu.get_data_parallel_rank() == 0: + print( + ' > number of parameters on model parallel rank {}: {}'.format( + mpu.get_model_parallel_rank(), + sum([p.nelement() for p in model.parameters()])), + flush=True) + + # To prevent OOM for model sizes that cannot fit in GPU memory in full precision + if args.fp16: + model.half() + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if args.fp16: + model = FP16_Module(model) + + # Wrap model for distributed training. + if not args.deepspeed and (args.train_iters or args.epochs): + if args.DDP_impl == 'torch': + i = torch.cuda.current_device() + model = TorchDDP( + model, + device_ids=[i], + output_device=i, + process_group=mpu.get_data_parallel_group()) + elif args.DDP_impl == 'local': + model = LocalDDP(model) + else: + print_rank_0('Skip DDP model') + return model + + +def get_optimizer_param_groups(model): + # Build parameter groups (weight decay and non-decay). + while isinstance(model, (LocalDDP, TorchDDP, FP16_Module)): + model = model.module + param_groups = glm_get_params_for_weight_decay_optimization(model) + + # Add model parallel attribute if it is not set. + for param_group in param_groups: + # print('## param_group', len(param_group['params'])) + for param in param_group['params']: + if not hasattr(param, 'model_parallel'): + param.model_parallel = False + + return param_groups + + +def get_optimizer(param_groups, args): + """Set up the optimizer.""" + if args.cpu_optimizer: + # Apex FusedAdam uses decoupled weight decay so use the same here + if args.cpu_torch_adam: + cpu_adam_optimizer = torch.optim.AdamW + else: + from deepspeed.ops.adam import DeepSpeedCPUAdam + cpu_adam_optimizer = DeepSpeedCPUAdam + optimizer = cpu_adam_optimizer( + param_groups, lr=args.lr, weight_decay=args.weight_decay) + else: + # Use FusedAdam. 
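+        # 'adam' selects apex FusedAdam (decoupled weight decay, matching the note in
+        # the CPU branch above); 'adafactor' falls back to the HuggingFace implementation.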
+ if args.optimizer == 'adam': + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps) + elif args.optimizer == 'adafactor': + from transformers import Adafactor + optimizer = Adafactor( + param_groups, + lr=args.lr, + relative_step=False, + warmup_init=False) + else: + raise NotImplementedError + + print(f'Optimizer = {optimizer.__class__.__name__}') + if hasattr(args, 'deepspeed') and args.deepspeed: + raise NotImplementedError + # fp16 wrapper is not required for DeepSpeed. + # return optimizer + + # Wrap into fp16 optimizer. + if args.fp16: + optimizer = FP16_Optimizer( + optimizer, + static_loss_scale=args.loss_scale, + dynamic_loss_scale=args.dynamic_loss_scale, + dynamic_loss_args={ + 'scale_window': args.loss_scale_window, + 'min_scale': args.min_scale, + 'delayed_shift': args.hysteresis + }) + + return optimizer + + +def get_learning_rate_scheduler(optimizer, args): + """Build the learning rate scheduler.""" + + # Add linear learning rate scheduler. + if args.lr_decay_iters is not None: + num_iters = args.lr_decay_iters + else: + num_iters = args.train_iters + if args.finetune: + num_iters = num_iters // args.gradient_accumulation_steps + num_iters = max(1, num_iters) + init_step = -1 + warmup_iter = args.warmup * num_iters + lr_scheduler = AnnealingLR( + optimizer, + start_lr=args.lr, + warmup_iter=warmup_iter, + num_iters=num_iters - warmup_iter, + decay_style=args.lr_decay_style, + last_iter=init_step, + decay_ratio=args.lr_decay_ratio) + + return lr_scheduler + + +def setup_model_and_optimizer(args, + model_type=None, + multi_token=True, + num_labels=None, + spell_length=None): + """Setup model and optimizer.""" + + model = get_model( + args, + model_type=model_type, + multi_token=multi_token, + num_labels=num_labels, + spell_length=spell_length) + param_groups = get_optimizer_param_groups(model) + + if args.train_data is not None or args.data_dir is not None and ( + args.epochs > 0 or args.train_iters > 0): + if args.deepspeed: + print_rank_0('DeepSpeed is enabled.') + + model, optimizer, _, _ = deepspeed.initialize( + model=model, + model_parameters=param_groups, + args=args, + mpu=mpu, + dist_init_required=False) + else: + optimizer = get_optimizer(param_groups, args) + lr_scheduler = get_learning_rate_scheduler(optimizer, args) + else: + optimizer, lr_scheduler = None, None + + return model, optimizer, lr_scheduler + + +def backward_step(optimizer, model, lm_loss, args, timers): + """Backward step.""" + + # Total loss. + loss = lm_loss + + # Backward pass. + if args.deepspeed: + model.backward(loss) + else: + # optimizer.zero_grad() + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.deepspeed or args.DDP_impl == 'torch': + # DeepSpeed backward propagation already addressed all reduce communication. + # Reset the timer to avoid breaking timer logs below. + timers('allreduce').reset() + else: + timers('allreduce').start() + model.allreduce_params( + reduce_after=False, fp32_allreduce=args.fp32_allreduce) + timers('allreduce').stop() + + # Update master gradients. + if not args.deepspeed: + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. 
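+    # With fp16, clipping acts on the fp32 master gradients kept by FP16_Optimizer;
+    # otherwise the model-parallel aware mpu.clip_grad_norm is applied directly
+    # to the model parameters.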
+ if args.clip_grad > 0: + if not args.fp16: + mpu.clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss + + +def see_memory_usage(message, force=False): + if not force: + return + dist.barrier() + if dist.get_rank() == 0: + print(message) + print('Memory Allocated ', + torch.cuda.memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Max Memory Allocated ', + torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), + 'GigaBytes') + print('Cache Allocated ', + torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') + print('Max cache Allocated ', + torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), + 'GigaBytes') + print(' ') + # input("Press Any Key To Continue ..") + + +def train_step(data_iterator, + model, + optimizer, + lr_scheduler, + args, + timers, + forward_step_func, + mems=None, + single_step=False): + """Single training step.""" + lm_loss_total, count = 0.0, 0 + mems = [] if mems is None else mems + if not args.deepspeed: + optimizer.zero_grad() + while True: + skipped_iter, complete = 0, False + # Forward model for one step. + timers('forward').start() + lm_loss, mems, _ = forward_step_func(data_iterator, model, args, + timers, mems) + timers('forward').stop() + # print_rank_0("Forward step") + if not args.deepspeed: + lm_loss /= args.gradient_accumulation_steps + + reduced_loss = lm_loss.detach().clone().view(1) + torch.distributed.all_reduce( + reduced_loss.data, group=mpu.get_data_parallel_group()) + reduced_loss.data = reduced_loss.data / ( + args.world_size / args.model_parallel_size) + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + lm_loss_total += reduced_loss + count += 1 + + # Calculate gradients, reduce across processes, and clip. + timers('backward').start() + backward_step(optimizer, model, lm_loss, args, timers) + timers('backward').stop() + # print_rank_0("Backward step") + # Update parameters. + timers('optimizer').start() + if args.deepspeed: + if model.is_gradient_accumulation_boundary(): + model.step() + complete = True + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + else: + skipped_iter = 1 + else: + model.step() + else: + if count == args.gradient_accumulation_steps: + optimizer.step() + complete = True + # Update learning rate. + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + else: + skipped_iter = 1 + # print_rank_0("Optimizer step") + timers('optimizer').stop() + if complete: + break + else: + print_rank_0('Found NaN loss, skip backward') + del lm_loss, reduced_loss + mems = [] + if single_step: + break + if args.deepspeed: + lm_loss_total = lm_loss_total / count + return lm_loss_total, skipped_iter, mems diff --git a/modelscope/models/nlp/mglm/utils.py b/modelscope/models/nlp/mglm/utils.py new file mode 100644 index 00000000..2bfcf8c0 --- /dev/null +++ b/modelscope/models/nlp/mglm/utils.py @@ -0,0 +1,529 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for logging and serialization""" + +import os +import random +import subprocess +import time + +import json +import numpy as np +import torch + +from . import mpu +from .fp16 import FP16_Optimizer + +SUMMARY_WRITER_DIR_NAME = 'runs' + + +def get_log_dir(name, base): + return os.path.join(base, SUMMARY_WRITER_DIR_NAME, name) + + +def print_rank_0(message): + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + + +def get_hostname(): + hostname_cmd = ['hostname -I'] + result = subprocess.check_output(hostname_cmd, shell=True) + master_addr = result.decode('utf-8').split()[0] + return master_addr + + +def get_spare_port(args): + if torch.distributed.get_rank() == 0: + port = subprocess.check_output(['shuf -n 1 -i 10000-65535'], + shell=True) + port = int(port.strip()) + if port == args.master_port: + port = subprocess.check_output(['shuf -n 1 -i 10000-65535'], + shell=True) + port = int(port.strip()) + port = torch.cuda.LongTensor([port]) + else: + port = torch.cuda.LongTensor([0]) + torch.distributed.broadcast(port, 0) + port = port.item() + return port + + +def print_and_save_args(args, verbose=True, log_dir=None): + """Print arguments.""" + if verbose: + print('arguments:', flush=True) + for arg in vars(args): + dots = '.' * (29 - len(arg)) + print( + ' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) + if log_dir is not None: + json_file = os.path.join(log_dir, 'config.json') + with open(json_file, 'w') as output: + json.dump(vars(args), output, sort_keys=True) + if args.deepspeed and args.deepspeed_config is not None: + with open(args.deepspeed_config) as file: + deepspeed_config = json.load(file) + deepspeed_json_file = os.path.join(log_dir, + 'config_gpt_large.json') + with open(deepspeed_json_file, 'w') as output: + json.dump(deepspeed_config, output) + + +def print_params_min_max_norm(optimizer, iteration): + """Print min, max, and norm of all parameters.""" + index = 0 + rank = torch.distributed.get_rank() + string = 'iteration, rank, index, model-parallel,min, max, norm\n' + optimizer_ = optimizer + if isinstance(optimizer, FP16_Optimizer): + optimizer_ = optimizer.optimizer + for param_group in optimizer_.param_groups: + for param in param_group['params']: + index += 1 + min_ = param.data.min() + max_ = param.data.max() + norm = param.data.norm() + string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( + iteration, rank, index, int(param.model_parallel)) + string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) + print(string, flush=True) + + +class Timers: + """Group of timers.""" + + class Timer: + """Timer.""" + + def __init__(self, name): + self.name_ = name + self.elapsed_ = 0.0 + self.started_ = False + self.start_time = time.time() + + def start(self): + """Start the timer.""" + assert not self.started_, 'timer has already been started' + torch.cuda.synchronize() + self.start_time = time.time() + self.started_ = True + + def stop(self): + """Stop the timer.""" + assert self.started_, 'timer is not started' + torch.cuda.synchronize() + self.elapsed_ += (time.time() - self.start_time) + self.started_ = False + + def reset(self): + """Reset timer.""" + self.elapsed_ = 0.0 + self.started_ = False + + def elapsed(self, reset=True): + """Calculate the elapsed time.""" + started_ = self.started_ + # If the timing in progress, end it first. 
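+            # (the timer is restarted further down, so reading the elapsed time does
+            # not discard an in-flight measurement)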
+ if self.started_: + self.stop() + # Get the elapsed time. + elapsed_ = self.elapsed_ + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if started_: + self.start() + return elapsed_ + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = self.Timer(name) + return self.timers[name] + + def log(self, names, normalizer=1.0, reset=True): + """Log a group of timers.""" + assert normalizer > 0.0 + string = 'time (ms)' + for name in names: + elapsed_time = self.timers[name].elapsed( + reset=reset) * 1000.0 / normalizer + string += ' | {}: {:.2f}'.format(name, elapsed_time) + print_rank_0(string) + + +def report_memory(name): + """Simple GPU memory report.""" + + mega_bytes = 1024.0 * 1024.0 + string = name + ' memory (MB)' + string += ' | allocated: {}'.format(torch.cuda.memory_allocated() + / mega_bytes) + string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated() + / mega_bytes) + string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes) + string += ' | max cached: {}'.format(torch.cuda.memory_reserved() + / mega_bytes) + print_rank_0(string) + + +def get_checkpoint_name(checkpoints_path, + iteration, + release=False, + zero=False): + if release: + d = 'release' + else: + d = '{}'.format(iteration) + if zero: + dp_rank = mpu.get_data_parallel_rank() + d += '_zero_dp_rank_{}'.format(dp_rank) + return os.path.join( + checkpoints_path, d, + 'mp_rank_{:02d}_model_states.pt'.format(mpu.get_model_parallel_rank())) + + +def ensure_directory_exists(filename): + dirname = os.path.dirname(filename) + if not os.path.exists(dirname): + os.makedirs(dirname, exist_ok=True) + + +def get_checkpoint_tracker_filename(checkpoints_path): + return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') + + +def save_zero_checkpoint(args, iteration, optimizer): + zero_sd = { + 'iteration': iteration, + 'optimizer_state_dict': optimizer.state_dict() + } + zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True) + ensure_directory_exists(zero_checkpoint_name) + torch.save(zero_sd, zero_checkpoint_name) + print(' successfully saved {}'.format(zero_checkpoint_name)) + + +def save_checkpoint(iteration, + model, + optimizer, + lr_scheduler, + args, + tag=None, + barrier=True, + only_changed_parameters=False, + no_deepspeed=False, + no_save_optim=False): + """Save a model checkpoint.""" + if tag is None: + tag = str(iteration) + if args.deepspeed and not no_deepspeed: + save_ds_checkpoint(iteration, model, lr_scheduler, args, tag=tag) + else: + # Only rank zer0 of the data parallel writes to the disk. + + if mpu.get_data_parallel_rank() == 0: + checkpoint_name = get_checkpoint_name(args.save, tag) + print( + 'global rank {} is saving checkpoint at iteration {:7d} to {}'. + format(torch.distributed.get_rank(), iteration, + checkpoint_name)) + sd = {'iteration': iteration} + if args.deepspeed: + model = model.module + state_dict = model.state_dict() + if only_changed_parameters: + requires_grad_dict = {} + for name, parameter in model.named_parameters(): + requires_grad_dict[name] = parameter.requires_grad + state_dict = { + key: value + for key, value in state_dict.items() + if requires_grad_dict[key] + } + sd['module'] = state_dict + + # Optimizer stuff. 
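+            # Optimizer and LR-scheduler state are written only when neither the global
+            # --no-save-optim flag nor the per-call no_save_optim argument is set.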
+ if not args.no_save_optim and not no_save_optim: + if optimizer is not None: + sd['optimizer'] = optimizer.state_dict() + if lr_scheduler is not None: + sd['lr_scheduler'] = lr_scheduler.state_dict() + + # rng states. + if not args.no_save_rng: + sd['random_rng_state'] = random.getstate() + sd['np_rng_state'] = np.random.get_state() + sd['torch_rng_state'] = torch.get_rng_state() + sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker( + ).get_states() + + ensure_directory_exists(checkpoint_name) + torch.save(sd, checkpoint_name) + print(' successfully saved {}'.format(checkpoint_name)) + + # Wait so everyone is done (necessary) + if barrier: + torch.distributed.barrier() + # And update the latest iteration + if torch.distributed.get_rank() == 0: + tracker_filename = get_checkpoint_tracker_filename(args.save) + with open(tracker_filename, 'w') as f: + f.write(tag) + + +def save_ds_checkpoint(iteration, model, lr_scheduler, args, tag): + """Save a model checkpoint.""" + + sd = {} + sd['iteration'] = iteration + if lr_scheduler is not None: + sd['client_lr_scheduler'] = lr_scheduler.state_dict() + # rng states. + if not args.no_save_rng: + sd['random_rng_state'] = random.getstate() + sd['np_rng_state'] = np.random.get_state() + sd['torch_rng_state'] = torch.get_rng_state() + sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() + model.save_checkpoint(args.save, tag, client_state=sd) + + +def get_checkpoint_iteration(load_path): + # Read the tracker file and set the iteration. + tracker_filename = get_checkpoint_tracker_filename(load_path) + if not os.path.isfile(tracker_filename): + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + if os.path.isdir(load_path): + path = os.path.normpath(load_path) + load_dir, tag = os.path.split(path) + print_rank_0( + 'Try to directly load the checkpoint from the directory') + return load_dir, tag, False, True + print_rank_0(' will not load any checkpoints and will start from ' + 'random') + return load_path, 0, False, False + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + release = metastring == 'release' + # try: + # iteration = int(metastring) + # except ValueError: + # release = metastring == 'release' + # if not release: + # print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( + # tracker_filename)) + # exit() + + # assert iteration > 0 or release, 'error parsing metadata file {}'.format( + # tracker_filename) + + return load_path, metastring, release, True + + +def load_checkpoint(model, + optimizer, + lr_scheduler, + args, + no_deepspeed=False, + no_load_optim=False): + """Load a model checkpoint.""" + + load_dir, tag, release, success = get_checkpoint_iteration(args.load) + + if not success: + return 0 + + if args.deepspeed and not no_deepspeed: + + checkpoint_name, sd = model.load_checkpoint( + load_dir, + tag, + load_optimizer_states=not args.no_load_optim and not no_load_optim, + load_lr_scheduler_states=not args.no_load_lr_scheduler) + if not args.no_load_lr_scheduler and 'client_lr_scheduler' in sd: + lr_scheduler.load_state_dict(sd['client_lr_scheduler']) + print_rank_0('Load lr scheduler state') + if checkpoint_name is None: + if mpu.get_data_parallel_rank() == 0: + print('Unable to load checkpoint.') + return tag + + else: + + # Checkpoint. 
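+        # Resolve the mp_rank_XX_model_states.pt file for this model-parallel rank and
+        # load it on CPU (map_location='cpu') before copying the weights into the model.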
+ checkpoint_name = get_checkpoint_name(load_dir, tag, release) + + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + # Load the checkpoint. + sd = torch.load(checkpoint_name, map_location='cpu') + + # Model. + if args.deepspeed: + model = model.module + missing_keys, unexpected_keys = model.load_state_dict( + sd['module'], strict=False) + if missing_keys or unexpected_keys: + print_rank_0( + f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}' + ) + + # Optimizer. + if not release and not args.finetune and not args.no_load_optim and not no_load_optim: + try: + if optimizer is not None: + optimizer.load_state_dict(sd['optimizer']) + if lr_scheduler is not None: + lr_scheduler.load_state_dict(sd['lr_scheduler']) + except KeyError: + print_rank_0( + 'Unable to load optimizer from checkpoint {}, exiting. ' + 'Specify --no-load-optim or --finetune to prevent ' + 'attempting to load the optimizer ' + 'state.'.format(checkpoint_name)) + + # Iterations. + if args.finetune or release: + iteration = 0 + else: + try: + iteration = sd['iteration'] + except KeyError: + try: # Backward compatible with older checkpoints + iteration = sd['total_iters'] + except KeyError: + print_rank_0( + 'A metadata file exists but Unable to load iteration ' + ' from checkpoint {}, starting from 0 iteration'.format( + checkpoint_name)) + iteration = 0 + + # rng states. + if not release and not args.finetune and not args.no_load_rng: + try: + random.setstate(sd['random_rng_state']) + np.random.set_state(sd['np_rng_state']) + torch.set_rng_state(sd['torch_rng_state']) + torch.cuda.set_rng_state(sd['cuda_rng_state']) + mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states']) + except KeyError: + print_rank_0( + 'Unable to load random state from checkpoint {}, exiting. ' + 'Specify --no-load-rng or --finetune to prevent ' + 'attempting to load the random ' + 'state.'.format(checkpoint_name)) + + if mpu.get_data_parallel_rank() == 0: + print(' successfully loaded {}'.format(checkpoint_name)) + + return iteration + + +def load_weights(src, dst, dst2src=False): + """ + Loads weights from src to dst via in place copy. + src is a huggingface gpt2model, while dst is one of our models. + dst2src=True loads parameters from our models into huggingface's. + ^dst2src is still untested + """ + conv_layer = 'Conv1D' in str(type(src)) + for n, p in src.named_parameters(): + if dst2src: + data = dst._parameters[n].data + load = p.data + else: + data = p.data + load = dst._parameters[n].data + if conv_layer and 'weight' in n: + data = data.t().contiguous() + load.copy_(data) + + +# dst._parameters[n].data.copy_(data) + + +def load_mlp(our, oai, dst2src=False): + load_weights(oai.c_fc, our.dense_h_to_4h, dst2src) + load_weights(oai.c_proj, our.dense_4h_to_h, dst2src) + + +def load_attention(our, oai, dst2src=False): + load_weights(oai.c_attn, our.query_key_value, dst2src) + load_weights(oai.c_proj, our.dense, dst2src) + + +def load_transformer_layer(our, oai, dst2src=False): + load_weights(oai.ln_1, our.input_layernorm, dst2src) + load_weights(oai.ln_2, our.post_attention_layernorm, dst2src) + load_mlp(our.mlp, oai.mlp, dst2src) + load_attention(our.attention, oai.attn, dst2src) + + +def move_weights(our, oai, dst2src=False): + """ + Loads weights from `oai` to `our` via in place copy. + `oai` is a huggingface gpt2model, while `our` is one of our models. + dst2src=True loads parameters from our models into huggingface's. 
+ ^dst2src=True is still untested + """ + # while isinstance(our, (torchDDP, model.distributed.DistributedDataParallel, FP16_Module)): + # our=our.module + transformer_model = oai.transformer + load_weights(transformer_model.ln_f, our.transformer.final_layernorm, + dst2src) + load_weights(transformer_model.wte, our.word_embeddings, dst2src) + load_weights(transformer_model.wpe, our.position_embeddings, dst2src) + + for our_layer, oai_layer in zip(our.transformer.layers, oai.transformer.h): + load_transformer_layer(our_layer, oai_layer, dst2src) + + +def debug_finetune_data(local_vars, batch_id, tokenizer): + tokens, target_ids = local_vars['tokens'], local_vars['target_ids'] + attention_mask, logit_mask, position_ids = local_vars[ + 'attention_mask'], local_vars['logit_mask'], local_vars['position_ids'] + output_tokens = [] + sep = attention_mask[batch_id].item() + for i, token in enumerate(tokens[batch_id][:sep].tolist()): + token = tokenizer.IdToToken(token) + if token == '[MASK]': + token = f'[{position_ids[batch_id][0, i].item()}]' + output_tokens.append(token) + print(' '.join(output_tokens)) + target_positions = [] + for i in range(sep, tokens.size(-1)): + if logit_mask[batch_id][i]: + target_positions.append(i) + print(target_positions) + print(tokenizer.DecodeIds(tokens[batch_id][target_positions].tolist())) + if len(target_ids.shape) > 2: + print( + tokenizer.DecodeIds( + target_ids[batch_id][target_positions].tolist())) + else: + print(tokenizer.DecodeIds(target_ids[batch_id].tolist())) + print(position_ids[batch_id][:, target_positions]) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index cbdeede4..b983125a 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -516,6 +516,12 @@ TASK_OUTPUTS = { # } Tasks.text_generation: [OutputKeys.TEXT], + # summarization result for single sample + # { + # "text": "this is the text generated by a model." 
+ # } + Tasks.text_summarization: [OutputKeys.TEXT], + # text generation result for single sample # { # "text": "北京" diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 7b726308..1206ae08 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline + from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -71,6 +72,7 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], + 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py new file mode 100644 index 00000000..c6d03077 --- /dev/null +++ b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.models.nlp import MGLMForTextSummarization +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import (MGLMSummarizationPreprocessor, + Preprocessor) +from modelscope.utils.constant import Tasks + +__all__ = ['MGLMTextSummarizationPipeline'] + + +@PIPELINES.register_module( + group_key=Tasks.text_summarization, + module_name=Pipelines.mglm_text_summarization) +class MGLMTextSummarizationPipeline(Pipeline): + + def __init__(self, + model: Union[MGLMForTextSummarization, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = MGLMForTextSummarization(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + if preprocessor is None: + preprocessor = MGLMSummarizationPreprocessor() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + # define the forward pass + def forward(self, inputs: Union[Dict, str], + **forward_params) -> Dict[str, Any]: + inputs = {'text': inputs} if isinstance(inputs, str) else inputs + return self.model.generate(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index e568098f..0db1c7e0 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -18,16 +18,16 @@ if TYPE_CHECKING: from .nlp import ( DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, FillMaskPoNetPreprocessor, NLPPreprocessor, - NLPTokenizerPreprocessorBase, TextRankingPreprocessor, - RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, TokenClassificationPreprocessor, - TextErrorCorrectionPreprocessor, TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, Tokenize, + NLPTokenizerPreprocessorBase, PassageRankingPreprocessor, + 
TextRankingPreprocessor, RelationExtractionPreprocessor, + SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, + TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, - DialogModelingPreprocessor, DialogStateTrackingPreprocessor, - ConversationalTextToSqlPreprocessor, + MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, + DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, + DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, TableQuestionAnsweringPreprocessor, NERPreprocessorViet, NERPreprocessorThai, WordSegmentationPreprocessorThai) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor @@ -57,6 +57,7 @@ else: 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', + 'MGLMSummarizationPreprocessor', 'ZeroShotClassificationPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index d9c55fe1..7c48fb3c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -29,6 +29,7 @@ if TYPE_CHECKING: MultiWOZBPETextField, IntentBPETextField) from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor + from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor else: _import_structure = { 'nlp_base': [ @@ -62,6 +63,7 @@ else: 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py b/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py new file mode 100644 index 00000000..0a68a9fa --- /dev/null +++ b/modelscope/preprocessors/nlp/mglm_summarization_preprocessor.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 Zhipu.AI + +import os.path as osp +import re +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +from modelscope.metainfo import Models, Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger +from modelscope.utils.nlp import import_external_nltk_data +from modelscope.utils.type_assert import type_assert + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.mglm_summarization) +class MGLMSummarizationPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """preprocess the data + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, 
tuple, Dict]) -> Dict[str, Any]: + return data diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 9a4abd71..80fee546 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,18 +1,25 @@ +boto3 en_core_web_sm>=2.3.5 +fasttext +filelock +ftfy jieba>=0.42.1 -megatron_util +matplotlib +nltk pai-easynlp +pandas # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 pythainlp pyvi -# rough-score was just recently updated from 0.0.4 to 0.0.7 -# which introduced compatability issues that are being investigated -rouge_score<=0.0.4 +regex sacremoses>=0.0.41 +scikit_learn +sentencepiece seqeval spacy>=2.3.5 subword_nmt>=0.3.8 +termcolor text2sql_lgesql tokenizers transformers>=4.12.0 diff --git a/tests/pipelines/test_mglm_text_summarization.py b/tests/pipelines/test_mglm_text_summarization.py new file mode 100644 index 00000000..47abc741 --- /dev/null +++ b/tests/pipelines/test_mglm_text_summarization.py @@ -0,0 +1,47 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.preprocessors import MGLMSummarizationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class mGLMTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.output_dir = 'unittest_output' + os.makedirs(self.output_dir, exist_ok=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_mglm_with_name(self): + model = 'ZhipuAI/Multilingual-GLM-Summarization-zh' + preprocessor = MGLMSummarizationPreprocessor() + pipe = pipeline( + task=Tasks.text_summarization, + model=model, + preprocessor=preprocessor, + ) + result = pipe( + '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。' # noqa + ) + print(result) + + model = 'ZhipuAI/Multilingual-GLM-Summarization-en' + preprocessor = MGLMSummarizationPreprocessor() + pipe = pipeline( + task=Tasks.text_summarization, + model=model, + preprocessor=preprocessor, + ) + result = pipe( + '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。' # noqa + ) + print(result) + + +if __name__ == '__main__': + unittest.main() From 0c8ab1d137b0079d8fd461c9af9f21785c651e99 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Tue, 1 Nov 2022 23:43:09 +0800 Subject: [PATCH 15/29] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 61c3207a..fe104fa6 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,17 @@ # Introduction -ModelScope library is targeted to support training, evaluation and inference for the state of the art models provided by Mind and further support third-party models provided by users outside alibaba. +[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bringing together most advanced machine learning models from the AI community, and to streamlining the process of leveraging and applying AI models . 
The core ModelScope library enables developers to perform model inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains. -In order to enable ModelScope users to use the various models provided by ModelScope quickly and conveniently, we provide a set of complete Python library, which includes the implementation of ModelScope official models, inference, finetuning and evaluation support for those models such as preprocessor and evaluation metrics. We also provide easy-to-use APIs and rich usage examples. By calling the library, users can write just a few lines of code to complete tasks such as model inference, training, and evaluation, and can also quickly carry out secondary development on this basis to realize their own innovative ideas. +The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modal, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done within only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary. + +Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with the backend services of ModelScope, particularly with the Model-Hub and Dataset-Hub. Such interactions facilitate various entity (models and datasets) management to be performed seamlessly under-the-hood, such as entity lookup, version control, and cache management. -At present, the algorithm models provided by library cover four main AI fields of image, natural language processing, speech, and multi-modality, and dozens of application scenarios and tasks. # Installation Please refer to [installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85). -# Get Started +# Getting Started You can refer to [quick_start](https://modelscope.cn/docs/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B) for quick start. From 665c496e202fef24db4ca93cd56d863c745fae1f Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Wed, 2 Nov 2022 00:14:25 +0800 Subject: [PATCH 16/29] Revert "Update README.md" This reverts commit 0c8ab1d137b0079d8fd461c9af9f21785c651e99. --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fe104fa6..61c3207a 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,16 @@ # Introduction -[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bringing together most advanced machine learning models from the AI community, and to streamlining the process of leveraging and applying AI models . The core ModelScope library enables developers to perform model inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains. +ModelScope library is targeted to support training, evaluation and inference for the state of the art models provided by Mind and further support third-party models provided by users outside alibaba. 
-The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modal, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done within only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary. - -Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with the backend services of ModelScope, particularly with the Model-Hub and Dataset-Hub. Such interactions facilitate various entity (models and datasets) management to be performed seamlessly under-the-hood, such as entity lookup, version control, and cache management. +In order to enable ModelScope users to use the various models provided by ModelScope quickly and conveniently, we provide a set of complete Python library, which includes the implementation of ModelScope official models, inference, finetuning and evaluation support for those models such as preprocessor and evaluation metrics. We also provide easy-to-use APIs and rich usage examples. By calling the library, users can write just a few lines of code to complete tasks such as model inference, training, and evaluation, and can also quickly carry out secondary development on this basis to realize their own innovative ideas. +At present, the algorithm models provided by library cover four main AI fields of image, natural language processing, speech, and multi-modality, and dozens of application scenarios and tasks. # Installation Please refer to [installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85). -# Getting Started +# Get Started You can refer to [quick_start](https://modelscope.cn/docs/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B) for quick start. 
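The "few lines of code" workflow that the README text above describes boils down to constructing a pipeline for a task and calling it on raw input. A minimal sketch, assuming the MGLM summarization model exercised by tests/pipelines/test_mglm_text_summarization.py earlier in this series is reachable from the ModelScope hub:

    from modelscope.pipelines import pipeline
    from modelscope.preprocessors import MGLMSummarizationPreprocessor
    from modelscope.utils.constant import Tasks

    # Build a text-summarization pipeline. The model ID is the one used by the
    # unit test in this patch series and is only illustrative here.
    summarize = pipeline(
        task=Tasks.text_summarization,
        model='ZhipuAI/Multilingual-GLM-Summarization-en',
        preprocessor=MGLMSummarizationPreprocessor(),
    )

    # The pipeline wraps preprocessing, the model forward pass and
    # postprocessing; calling it on a raw string returns the generated summary.
    result = summarize('<long document text to summarize>')
    print(result)

The CodeGeeX code-translation test added later in this series follows the same pattern, swapping in Tasks.code_translation, its own preprocessor, and a dict-valued input.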
From f0e92bf5f2eedc1c895d28fee3a7f4855773a7b6 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Tue, 15 Nov 2022 18:57:09 +0800 Subject: [PATCH 17/29] upgrade flake8 precommit repo to fix linter test failed --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48fe7547..68fc8484 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ repos: - - repo: https://gitlab.com/pycqa/flake8.git + - repo: https://github.com/PyCQA/flake8 rev: 4.0.0 hooks: - id: flake8 From db0f25a5947c49b62cac7b99309a18540be4b929 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Tue, 22 Nov 2022 10:10:34 +0800 Subject: [PATCH 18/29] init --- modelscope/metainfo.py | 3 + modelscope/models/nlp/__init__.py | 2 + modelscope/models/nlp/codegeex/__init__.py | 22 + modelscope/models/nlp/codegeex/codegeex.py | 1030 +++++++++++++++++ .../codegeex/codegeex_for_code_translation.py | 126 ++ modelscope/models/nlp/codegeex/inference.py | 335 ++++++ modelscope/models/nlp/codegeex/tokenizer.py | 186 +++ modelscope/pipelines/nlp/__init__.py | 3 + .../nlp/codegeex_code_translation_pipeline.py | 44 + modelscope/preprocessors/__init__.py | 4 +- modelscope/preprocessors/nlp/__init__.py | 2 + .../nlp/codegeex_preprocessor.py | 25 + modelscope/utils/constant.py | 1 + .../test_CodeGeeX_code_translation.py | 38 + 14 files changed, 1819 insertions(+), 2 deletions(-) create mode 100755 modelscope/models/nlp/codegeex/__init__.py create mode 100755 modelscope/models/nlp/codegeex/codegeex.py create mode 100755 modelscope/models/nlp/codegeex/codegeex_for_code_translation.py create mode 100755 modelscope/models/nlp/codegeex/inference.py create mode 100755 modelscope/models/nlp/codegeex/tokenizer.py create mode 100755 modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py create mode 100755 modelscope/preprocessors/nlp/codegeex_preprocessor.py create mode 100644 tests/pipelines/test_CodeGeeX_code_translation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ccd36349..99f4a047 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -84,6 +84,7 @@ class Models(object): ponet = 'ponet' T5 = 'T5' mglm = 'mglm' + codegeex = 'codegeex' bloom = 'bloom' # audio models @@ -255,6 +256,7 @@ class Pipelines(object): document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' mglm_text_summarization = 'mglm-text-summarization' + codegeex_code_translation = 'codegeex-code-translation' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -382,6 +384,7 @@ class Preprocessors(object): document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' mglm_summarization = 'mglm-summarization' + codegeex = 'codegeex' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 1d71469a..3f9d224c 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -36,6 +36,7 @@ if TYPE_CHECKING: ) from .T5 import T5ForConditionalGeneration from .mglm import MGLMForTextSummarization + from .codegeex import CodeGeeXForCodeTranslation from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -108,6 +109,7 @@ else: 'sentence_embedding': ['SentenceEmbedding'], 'T5': 
['T5ForConditionalGeneration'], 'mglm': ['MGLMForTextSummarization'], + 'codegeex': ['CodeGeeXForCodeTranslation'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/codegeex/__init__.py b/modelscope/models/nlp/codegeex/__init__.py new file mode 100755 index 00000000..6ee72f80 --- /dev/null +++ b/modelscope/models/nlp/codegeex/__init__.py @@ -0,0 +1,22 @@ +# Modified by Zhipu.AI +# Original Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .codegeex_for_code_translation import CodeGeeXForCodeTranslation +else: + _import_structure = { + 'codegeex_for_code_translation': ['CodeGeeXForCodeTranslation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/codegeex/codegeex.py b/modelscope/models/nlp/codegeex/codegeex.py new file mode 100755 index 00000000..7a1b76a3 --- /dev/null +++ b/modelscope/models/nlp/codegeex/codegeex.py @@ -0,0 +1,1030 @@ +import math + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + + +def fast_gelu(x): + """Mindspore's fast gelu implementation.""" + return x / (1 + torch.exp(-1.702 * torch.abs(x))) * torch.exp( + 0.851 * (x - torch.abs(x))) + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + """ + + def __init__( + self, + hidden_size, + ): + super(MLP, self).__init__() + self.hidden_size = hidden_size + # Project to 4h. + self.dense_h_to_4h = torch.nn.Linear( + self.hidden_size, + 4 * self.hidden_size, + ) + + self.activation_func = fast_gelu + + # Project back to h. + self.dense_4h_to_h = torch.nn.Linear( + 4 * self.hidden_size, + self.hidden_size, + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + + return output + + +class SelfAttention(torch.nn.Module): + """self-attention layer abstract class. + + Self-attention layer takes input with size [b, s, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + fp16=True, + attention_softmax_in_fp32=True, + ): + super(SelfAttention, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.fp16 = fp16 + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.layer_number = max(1, layer_number) + + assert self.hidden_size % self.num_attention_heads == 0 + self.hidden_size_per_attention_head = int(self.hidden_size + // self.num_attention_heads) + + self.query = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.key = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.value = torch.nn.Linear(self.hidden_size, self.hidden_size) + + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.softmax = torch.nn.Softmax(dim=-1) + + self.dense = torch.nn.Linear(self.hidden_size, self.hidden_size) + + def forward( + self, + hidden_states, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + + query_layer = self.query(hidden_states) + key_layer = self.key(hidden_states) + value_layer = self.value(hidden_states) + + new_query_layer_shape = query_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + query_layer = query_layer.view(*new_query_layer_shape) + + new_query_layer_shape = key_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head) + key_layer = key_layer.view(*new_query_layer_shape) + + new_query_layer_shape = value_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + value_layer = value_layer.view(*new_query_layer_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), + dim=0) + value_layer = torch.cat( + (past_value.type_as(value_layer), value_layer), dim=0) + if get_key_value: + present = (key_layer, value_layer) + + # =================================== + # Raw attention scores. [b, np, sq, sk] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.contiguous().view( + output_size[2], output_size[0] * output_size[1], -1) + key_layer = key_layer.contiguous().view( + output_size[3], output_size[0] * output_size[1], -1) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.matmul( + query_layer.transpose(0, 1), + key_layer.transpose(0, 1).transpose(1, 2)) / self.norm_factor + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # ================================================== + # Update attention mask for inference. [b, np, sq, sk] + # ================================================== + + if get_key_value: + with torch.no_grad(): + if layer_past is not None: + attention_mask = attention_mask[ + ..., + attention_scores.size(3) + - 1, :attention_scores.size(3)].unsqueeze(2) + else: + attention_mask = attention_mask[ + ..., :attention_scores.size(3), :attention_scores. 
+ size(3)] + + if context_length is not None: + attention_mask = torch.clone(attention_mask) + attention_mask[:, :, context_length:, :] = True + + # attention scores and attention mask [b, np, sq, sk] + # attention_scores = attention_mask_func(attention_scores, attention_mask) + attention_scores = attention_scores - attention_mask * 10000.0 + if self.attention_softmax_in_fp32: + attention_probs = self.softmax(attention_scores.float()).half() + else: + attention_probs = self.softmax(attention_scores) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sq, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sq, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + context_layer = torch.bmm( + attention_probs, + value_layer.unsqueeze(0).transpose(1, 2).squeeze(0)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + if get_key_value: + output = [output, present] + + return output + + +class TopQuerySelfAttention(torch.nn.Module): + """Top query self-attention layer abstract class. + + Self-attention layer takes input with size [b, s, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + fp16=True, + attention_softmax_in_fp32=True, + ): + super(TopQuerySelfAttention, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.fp16 = fp16 + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.layer_number = max(1, layer_number) + + assert self.hidden_size % self.num_attention_heads == 0 + self.hidden_size_per_attention_head = int(self.hidden_size + // self.num_attention_heads) + + self.query = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.key = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.value = torch.nn.Linear(self.hidden_size, self.hidden_size) + + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.softmax = torch.nn.Softmax(dim=-1) + + self.dense = torch.nn.Linear(self.hidden_size, self.hidden_size) + + def forward( + self, + hidden_states, + query_hidden_state, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + + # hidden_states: [sq, b, h] + query_layer = self.query(query_hidden_state) + key_layer = self.key(hidden_states) + value_layer = self.value(hidden_states) + + new_query_layer_shape = query_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + query_layer = query_layer.view(*new_query_layer_shape) + + new_query_layer_shape = key_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head) + key_layer = key_layer.view(*new_query_layer_shape) + + new_query_layer_shape = value_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + value_layer = value_layer.view(*new_query_layer_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), + dim=0) + value_layer = torch.cat( + (past_value.type_as(value_layer), value_layer), dim=0) + if get_key_value: + present = (key_layer, value_layer) + + # =================================== + # Raw attention scores. [b, np, sq, sk] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [s, b, np, hn] -> [s, b * np, hn] + query_layer = query_layer.contiguous().view( + output_size[2], output_size[0] * output_size[1], -1) + key_layer = key_layer.contiguous().view( + output_size[3], output_size[0] * output_size[1], -1) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.matmul( + query_layer.transpose(0, 1), + key_layer.transpose(0, 1).transpose(1, 2)) / self.norm_factor + + # change view to [b, np, s, s] + attention_scores = matmul_result.view(*output_size) + + # ================================================== + # Update attention mask for inference. [b, np, sq, sk] + # ================================================== + + if get_key_value: + with torch.no_grad(): + if layer_past is not None: + attention_mask = attention_mask[ + ..., + attention_scores.size(3) + - 1, :attention_scores.size(3)].unsqueeze(2) + else: + attention_mask = attention_mask[ + ..., :attention_scores.size(3), :attention_scores. 
+ size(3)] + + if context_length is not None: + attention_mask = torch.clone(attention_mask) + attention_mask[:, :, context_length:, :] = True + + # attention scores and attention mask [b, np, sq, sk] + # attention_scores = attention_mask_func(attention_scores, attention_mask) + attention_scores = attention_scores - attention_mask * 10000.0 + if self.attention_softmax_in_fp32: + attention_probs = self.softmax(attention_scores.float()).half() + else: + attention_probs = self.softmax(attention_scores) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sq, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sq, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm( + attention_probs, + value_layer.unsqueeze(0).transpose(1, 2).squeeze(0)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size,) # noqa + context_layer = context_layer.view(*new_context_layer_shape) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + if get_key_value: + output = [output, present] + + return output + + +class TransformerLayer(torch.nn.Module): + """A single transformer layer. + + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + layernorm_epsilon=1e-5, + fp16=True, + attention_softmax_in_fp32=True, + ): + super(TransformerLayer, self).__init__() + self.hidden_size = hidden_size + self.layernorm_epsilon = layernorm_epsilon + self.layer_number = layer_number + + # Layernorm on the input data. + self.input_layernorm = torch.nn.LayerNorm( + hidden_size, eps=self.layernorm_epsilon) + + # Self attention. + self.attention = SelfAttention(hidden_size, num_attention_heads, + layer_number, fp16, + attention_softmax_in_fp32) + + # Layernorm on the input data. + self.post_attention_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + self.mlp = MLP(self.hidden_size) + + def forward( + self, + hidden_states, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # hidden_states: [b, s, h] + # Use FP32 for Layernorm + # layernorm_output = self.input_layernorm(hidden_states.float()).half() + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output = self.attention( + layernorm_output, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + attention_output, presents = attention_output + + # Residual connection. 
+ residual = hidden_states + layernorm_input = attention_output + residual + + # Use FP32 for Layernorm + # layernorm_output = self.post_attention_layernorm(layernorm_input.float()).half() + layernorm_output = self.post_attention_layernorm(layernorm_input) + mlp_output = self.mlp(layernorm_output) + output = mlp_output + layernorm_input + + if get_key_value: + output = [output, presents] + + return output + + +class TopQueryLayer(torch.nn.Module): + """A single top query layer. + + Top query layer takes input with size [b, s, h] and returns an + output of the same size. + """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + layernorm_epsilon=1e-5, + ): + super(TopQueryLayer, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.layernorm_epsilon = layernorm_epsilon + self.layer_number = layer_number + + # Use FP32 for Layernorm + self.input_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + + # Self attention. + self.attention = TopQuerySelfAttention(self.hidden_size, + self.num_attention_heads, + self.layer_number) + # Layernorm on the input data. + self.post_attention_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + + # MLP + self.mlp = MLP(self.hidden_size) + + def forward( + self, + hidden_states, + query_hidden_state, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # hidden_states: [b, s, h] + assert query_hidden_state != None # noqa + + # Use FP32 for Layernorm + # layernorm_output = self.input_layernorm(hidden_states.float()).half() + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output = self.attention( + layernorm_output, + query_hidden_state, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + attention_output, presents = attention_output + + # Residual connection. + residual = hidden_states + layernorm_input = attention_output + residual + + # Use FP32 for Layernorm + # layernorm_output = self.post_attention_layernorm(layernorm_input.float()).half() + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + residual = layernorm_input + output = mlp_output + residual + + if get_key_value: + output = [output, presents] + + return output + + +class Transformer(torch.nn.Module): + """Transformer class.""" + + def __init__( + self, + hidden_size, + num_attention_heads, + num_layers, + layernorm_epsilon=1e-5, + ): + super(Transformer, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.layernorm_epsilon = layernorm_epsilon + # Number of layers: + self.num_layers = num_layers + self.num_unique_layers = None + + ################# + assert self.num_unique_layers is None + ################# + + if self.num_unique_layers is None: + self.num_unique_layers = self.num_layers + assert self.num_layers % self.num_unique_layers == 0, \ + 'number of layers should be divisible by number of unique layers' + + # Transformer layers. 
+ def build_layer(layer_number): + return TransformerLayer(self.hidden_size, self.num_attention_heads, + layer_number) + + self.layers = torch.nn.ModuleList( + [build_layer(i + 1) for i in range(self.num_unique_layers)]) + + self.topQueryLayer = TopQueryLayer(self.hidden_size, + self.num_attention_heads, + self.num_unique_layers) + + self.final_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + + def _get_layer_index(self, layer_number): + return layer_number % self.num_unique_layers + + def _get_layer(self, layer_number): + return self.layers[self._get_layer_index(layer_number)] + + def forward( + self, + hidden_states, + query_hidden_state, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # data format change to avoid explicit tranposes : [b s h] --> [s b h] + hidden_states = hidden_states.transpose(0, 1).contiguous() + query_hidden_state = query_hidden_state.transpose(0, 1).contiguous() + + if get_key_value: + presents = [] + for index in range(self.num_layers): + layer = self._get_layer(index) + past = None + if layer_past is not None: + past = layer_past[index] + hidden_states = layer( + hidden_states, + attention_mask, + layer_past=past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + if get_key_value: + hidden_states, present = hidden_states + presents.append(present) + + # Use FP32 for Layernorm + # hidden_states_ = self.final_layernorm(hidden_states.float()).half() + hidden_states_ = self.final_layernorm(hidden_states) + + ################################# + # top query layer + ################################# + past = None + if layer_past is not None: + past = layer_past[self.num_layers] + hidden_states = self.topQueryLayer( + hidden_states_, + query_hidden_state, + attention_mask, + layer_past=past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + hidden_states, present = hidden_states + presents.append(present) + + # reverting data format change [s b h] --> [b s h] + output = hidden_states.transpose(0, 1).contiguous() + + if get_key_value: + output = [output, presents] + + return output + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + return self.state_dict(destination, prefix, keep_vars) + + +class Embedding(torch.nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + """ + + def __init__( + self, + hidden_size, + vocab_size, + max_sequence_length, + ): + super(Embedding, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + + # Word embeddings. + self.word_embeddings = torch.nn.Embedding(self.vocab_size, + self.hidden_size) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding. + self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, + self.hidden_size) + self.position_embeddings = self.position_embeddings.half() + self._position_embeddings_key = 'position_embeddings' + + def forward(self, input_ids, position_ids): + # Embeddings. 
+ words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + + return embeddings + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(destination, prefix, keep_vars) + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict( + destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + state_dict_['weight'] = state_dict_['weight'][:self.vocab_size] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + +class QueryEmbedding(torch.nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + """ + + def __init__( + self, + hidden_size, + vocab_size, + max_sequence_length, + ): + super(QueryEmbedding, self).__init__() + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + + # Top query position embedding (serial). + self.top_query_embeddings = torch.nn.Embedding( + self.max_sequence_length, self.hidden_size) + self.top_query_embeddings = self.top_query_embeddings.half() + self._top_query_embeddings_key = 'top_query_embeddings' + + def forward(self, position_ids): + # Embeddings. + embeddings = self.top_query_embeddings(position_ids) + + return embeddings + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._top_query_embeddings_key] \ + = self.top_query_embeddings.state_dict( + destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Position embedding. + if self._top_query_embeddings_key in state_dict: + state_dict_ = state_dict[self._top_query_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'top_query_embeddings' in key: + state_dict_[key.split('top_query_embeddings.')[1]] \ + = state_dict[key] + self.top_query_embeddings.load_state_dict(state_dict_, strict=strict) + + +class TransformerLanguageModel(torch.nn.Module): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + attention_mask_func: a function that takes `unmaksed-attention-scores` + with size [b, np, s, s] and an `attention-mask` and will apply + the masking. 
The function should return a masked score of the + same size [b, np, s, s]. + masked-attention-scores = attention_mask_func( + unmaksed-attention-scores, attention-mask) + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + """ + + def __init__( + self, + hidden_size, + num_layers, + num_attention_heads, + padded_vocab_size, + max_position_embeddings, + ): + super(TransformerLanguageModel, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_attention_heads = num_attention_heads + self.padded_vocab_size = padded_vocab_size + self.max_position_embeddings = max_position_embeddings + + # Embeddings + self.embedding = Embedding(self.hidden_size, self.padded_vocab_size, + self.max_position_embeddings) + self._embedding_key = 'embedding' + + # Query embeddings + self.topQueryEmbedding = QueryEmbedding(self.hidden_size, + self.padded_vocab_size, + self.max_position_embeddings) + self._topQueryEmbedding_key = 'topQueryEmbedding' + + # Transformer + self.transformer = Transformer(self.hidden_size, + self.num_attention_heads, + self.num_layers) + self._transformer_key = 'transformer' + + def forward( + self, + input_ids, + position_ids, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + + # Embeddings. + embedding_output = self.embedding(input_ids, position_ids) + query_position_ids = position_ids + queryEmbedding_out = self.topQueryEmbedding(query_position_ids) + + # Transformer. + transformer_output = self.transformer( + embedding_output, + queryEmbedding_out, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + return transformer_output + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + state_dict_[self._topQueryEmbedding_key] \ + = self.topQueryEmbedding.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + state_dict_[self._transformer_key] \ + = self.transformer.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + if self._topQueryEmbedding_key in state_dict: + state_dict_ = state_dict[self._topQueryEmbedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.topQueryEmbedding.load_state_dict(state_dict_, strict=strict) + + # Transformer. + if self._transformer_key in state_dict: + state_dict_ = state_dict[self._transformer_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' 
in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + self.transformer.load_state_dict(state_dict_, strict=strict) + + +class CodeGeeXModel(torch.nn.Module): + """CodeGeeX: A Multilingual Code Generation Model.""" + + def __init__( + self, + hidden_size, + num_layers, + num_attention_heads, + padded_vocab_size, + max_position_embeddings, + ): + super(CodeGeeXModel, self).__init__() + + self.language_model = TransformerLanguageModel( + hidden_size, num_layers, num_attention_heads, padded_vocab_size, + max_position_embeddings) + self._language_model_key = 'language_model' + + def forward( + self, + input_ids, + position_ids, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # Language model. + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + lm_output, presents = lm_output + + output = F.linear( + lm_output, + self.language_model.embedding.word_embeddings.weight.half()) + + if get_key_value: + output = [output, presents] + + return output + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py new file mode 100755 index 00000000..0e9d161b --- /dev/null +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 Zhipu.AI + +import copy +import os +import random +import time +from typing import Dict + +import numpy as np +import torch +from IPython import embed + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from .codegeex import CodeGeeXModel +from .inference import get_token_stream +from .tokenizer import CodeGeeXTokenizer + + +def model_provider(): + """Build the model.""" + + hidden_size = 5120 + num_attention_heads = 40 + num_layers = 39 + padded_vocab_size = 52224 + max_position_embeddings = 2048 + + model = CodeGeeXModel(hidden_size, num_layers, num_attention_heads, + padded_vocab_size, max_position_embeddings) + + return model + + +@MODELS.register_module(Tasks.code_translation, module_name=Models.codegeex) +class CodeGeeXForCodeTranslation(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the fast poem model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        # loading tokenizer
+        print('Loading tokenizer ...')
+        self.tokenizer = CodeGeeXTokenizer(
+            tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b')
+        # loading model
+        state_dict_path = model_dir + '/ckpt_ms_translation_0817.pt'
+        print('Loading state dict ...')
+        state_dict = torch.load(state_dict_path, map_location='cpu')
+        state_dict = state_dict['module']
+
+        print('Building CodeGeeX model ...')
+        self.model = model_provider()
+        self.model.load_state_dict(state_dict)
+        self.model.eval()
+        self.model.half()
+        self.model.cuda()
+
+    def forward(self, input: Dict[str, str]) -> Dict[str, str]:
+        micro_batch_size = 1
+        seq_length = 2048
+        out_seq_length = 256
+        bad_ids = None
+        print('Generating ...')
+        src_lang = input['source language']
+        dst_lang = input['target language']
+        prompt = input['prompt']
+        prompt = f'code translation\n{src_lang}:\n{prompt}\n{dst_lang}:\n'
+        t0 = time.perf_counter()
+        tokenizer = self.tokenizer
+        model = self.model
+        for prompt in [prompt]:
+            tokens = tokenizer.encode_code(prompt)
+            print(tokens)
+            print('Current prompt:')
+            print(prompt)
+            n_token_prompt = len(tokens)
+            print('N_token_prompt:', n_token_prompt)
+            token_stream = get_token_stream(
+                model,
+                tokenizer,
+                seq_length,
+                out_seq_length,
+                [copy.deepcopy(tokens) for _ in range(micro_batch_size)],
+                micro_batch_size=micro_batch_size,
+                bad_ids=bad_ids,
+                greedy=True,
+            )
+            is_finished = [False for _ in range(micro_batch_size)]
+            for i, generated in enumerate(token_stream):
+                generated_tokens = generated[0]
+                for j in range(micro_batch_size):
+                    if is_finished[j]:
+                        continue
+                    if generated_tokens[j].cpu().numpy(
+                    )[-1] == tokenizer.eos_token_id or len(
+                            generated_tokens[j]) >= out_seq_length:
+                        is_finished[j] = True
+                        generated_tokens_ = generated_tokens[j].cpu().numpy(
+                        ).tolist()
+                        generated_code = tokenizer.decode_code(
+                            generated_tokens_[n_token_prompt:])
+                        generated_code = ''.join(generated_code)
+                        t1 = time.perf_counter()
+                        print('Total generation time:', t1 - t0, '# Tokens:',
+                              len(generated_tokens_) - n_token_prompt)
+                        print(
+                            f'{(t1 - t0) / (len(generated_tokens_) - n_token_prompt)}s/token'
+                        )
+                        print(
+                            '================================= Generated code:'
+                        )
+                        print(generated_code)
+                        t0 = time.perf_counter()
+                if all(is_finished):
+                    break
+
+        print('Generation finished.')
+        return {OutputKeys.TEXT: generated_code}
diff --git a/modelscope/models/nlp/codegeex/inference.py b/modelscope/models/nlp/codegeex/inference.py
new file mode 100755
index 00000000..76a9458b
--- /dev/null
+++ b/modelscope/models/nlp/codegeex/inference.py
@@ -0,0 +1,335 @@
+import copy
+import os
+import time
+from typing import List
+from dataclasses import dataclass
+
+import json
+import torch
+import torch.nn.functional as F
+
+
+def get_ltor_masks_and_position_ids(
+    data,
+    eod_token,
+    reset_position_ids,
+    reset_attention_mask,
+):
+    """Build masks and position id for left to right model."""
+
+    # Extract batch size and sequence length.
+    micro_batch_size, seq_length = data.size()
+
+    # Attention mask (lower triangular).
+    if reset_attention_mask:
+        att_mask_batch = micro_batch_size
+    else:
+        att_mask_batch = 1
+    attention_mask = torch.tril(
+        torch.ones((att_mask_batch, seq_length, seq_length),
+                   device=data.device)).view(att_mask_batch, 1, seq_length,
+                                             seq_length)
+
+    # Position ids.
+ position_ids = torch.arange( + seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= i + 1 - prev_index + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + + return attention_mask, position_ids + + +def get_batch( + context_tokens, + micro_batch_size, + eod_token, + reset_position_ids=False, + reset_attention_mask=False, +): + """Generate batch from context tokens.""" + tokens = context_tokens.view(micro_batch_size, -1).contiguous().cuda() + # Get the attention mask and postition ids. + attention_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + eod_token, + reset_position_ids, + reset_attention_mask, + ) + + return tokens, attention_mask, position_ids + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """This function has been mostly taken from huggingface conversational + ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art- + conversational-ai-with-transfer-learning-2d818ac26313""" + + if top_k > 0: + # Remove all tokens with a probability less than the + # last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Cconvert to 1D + sorted_logits, sorted_indices = torch.sort( + logits, descending=True, dim=-1) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token + # above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + for i in range(sorted_indices.size(0)): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i][indices_to_remove] = filter_value + + return logits + + +def pad_batch(batch, pad_id, seq_length): + context_lengths = [] + for tokens in batch: + context_length = len(tokens) + if context_length < seq_length: + tokens.extend([pad_id] * (seq_length - context_length)) + context_lengths.append(context_length) + return batch, context_lengths + + +def forward_step( + model, + tokens, + seq_length, + position_ids, + attention_mask, + layer_past=None, + get_key_value=None, + prompt_length=None, + context_length=None, +): + # Forward pass through the model. 
+ output_tensor = model( + tokens, + position_ids, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length, + ) + + if get_key_value: + output_tensor, layer_past = output_tensor + + if get_key_value: + return output_tensor, layer_past + + return output_tensor + + +def get_token_stream( + model, + tokenizer, + seq_length, + out_seq_length, + context_tokens, + return_scores: bool = False, + prompt_length: int = None, + micro_batch_size: int = None, + bad_ids: List = None, + temperature: float = 1.0, + topp: float = 1.0, + topk: int = 0.0, + greedy: bool = False, +): + context_tokens, context_lengths = pad_batch(context_tokens, + tokenizer.eos_token_id, + seq_length) + + context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_length_tensor = torch.cuda.LongTensor(context_lengths) + context_length = context_length_tensor.min().item() + tokens, attention_mask, position_ids = get_batch( + context_tokens_tensor, + micro_batch_size, + tokenizer.eos_token_id, + ) + + batch_token_iterator = sample_sequence_batch( + model, + tokenizer, + context_tokens_tensor, + context_length_tensor, + attention_mask, + position_ids, + seq_length=seq_length, + out_seq_length=out_seq_length, + return_scores=return_scores, + prompt_length=prompt_length, + bad_ids=bad_ids, + temperature=temperature, + topp=topp, + topk=topk, + greedy=greedy, + ) + + for tokens, lengths in batch_token_iterator: + context_length += 1 + if tokens is not None: + yield tokens[:, :context_length], lengths + else: + yield None, None + + +def switch(val1, val2, boolean): + boolean = boolean.type_as(val1) + return (1 - boolean) * val1 + boolean * val2 + + +def sample_sequence_batch( + model, + tokenizer, + context_tokens, + context_lengths, + attention_mask, + position_ids, + seq_length, + out_seq_length, + maxlen=None, + return_scores: bool = False, + prompt_length: int = None, + bad_ids: List = None, + temperature: float = 1.0, + topp: float = 1.0, + topk: int = 0.0, + recompute: bool = False, + greedy: bool = False, +): + model.eval() + with torch.no_grad(): + context_length = context_lengths.min().item() + eos_id = tokenizer.eos_token_id + + counter = 0 + org_context_length = context_length + + layer_past = None + batch_size = context_tokens.size(0) + is_done = torch.zeros([batch_size]).byte().cuda() + tokens = context_tokens + if maxlen is None: + maxlen = seq_length - 1 + if maxlen > (org_context_length + out_seq_length): + maxlen = org_context_length + out_seq_length + + lengths = torch.ones([batch_size]).long().cuda() * maxlen + if return_scores: + scores = torch.zeros([batch_size]).float().cuda() + + while context_length <= (maxlen): + + if recompute: + logits = model( + tokens, + position_ids, + attention_mask, + prompt_length=prompt_length, + context_length=context_length, + ) + logits = logits[:, context_length - 1, :] + else: + if counter == 0: + tokens2use = tokens[:, :context_length] + positions2use = position_ids[:, :context_length] + else: + tokens2use = tokens[:, context_length - 1].view( + batch_size, -1) + positions2use = position_ids[:, context_length - 1].view( + batch_size, -1) + logits, layer_past = model( + tokens2use, + positions2use, + attention_mask, + layer_past=layer_past, + get_key_value=True, + prompt_length=prompt_length, + context_length=context_length, + ) + logits = logits[:, -1].view(batch_size, -1).contiguous() + + if bad_ids is not None: + for bad_id in bad_ids: + logits[:, bad_id] = -10000 + if greedy: + prev 
= torch.argmax(logits, dim=-1).view(-1)
+            else:
+                logits = logits.float()
+                if return_scores:
+                    orig_log_probs = torch.log_softmax(logits, dim=-1)
+                logits /= temperature
+                logits = top_k_logits(logits, top_k=topk, top_p=topp)
+                log_probs = F.softmax(logits, dim=-1)
+                prev = torch.multinomial(log_probs, num_samples=1).view(-1)
+
+            started = context_lengths <= context_length
+
+            new_tokens = switch(tokens[:, context_length].view(-1), prev,
+                                started)
+
+            if not greedy and return_scores:
+                indices = prev.view(-1, 1)
+                new_scores = orig_log_probs.gather(1, indices).view(-1)
+                new_scores = new_scores * started
+                new_scores = new_scores * is_done.bool().logical_not()
+                scores += new_scores
+
+            tokens[:, context_length] = new_tokens
+            done_token = (prev == eos_id).byte() & started.byte()
+            just_finished = (done_token & ~is_done).bool()
+            lengths[just_finished.view(-1)] = context_length
+            is_done = is_done | done_token
+            done = torch.all(is_done)
+
+            if return_scores:
+                yield tokens, (lengths, scores)
+            else:
+                yield tokens, lengths
+
+            context_length += 1
+            counter += 1
+            if done:
+                break
diff --git a/modelscope/models/nlp/codegeex/tokenizer.py b/modelscope/models/nlp/codegeex/tokenizer.py
new file mode 100755
index 00000000..66958d7d
--- /dev/null
+++ b/modelscope/models/nlp/codegeex/tokenizer.py
@@ -0,0 +1,186 @@
+from typing import List, Union
+
+import torch
+from transformers import AutoTokenizer
+from transformers.models.gpt2 import GPT2TokenizerFast
+
+
+def encode_whitespaces(text, start_extra_id: int, max_len: int):
+    """ Encode whitespaces to extra tokens in GPT-J.
+
+    >>> encode_whitespaces('a\\n  b\\n   c', 10, 10)
+    'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c'
+    """
+
+    def push_acc_space(acc_len: int, text: str):
+        if acc_len == 0:
+            return text
+        if acc_len == 1:
+            return text + ' '
+        assert acc_len <= max_len, f'Max whitespace run length {max_len}, but found {acc_len}'
+        extra_id = start_extra_id - 2 + acc_len
+        extra_token = f'<|extratoken_{extra_id}|>'
+        return text + extra_token
+
+    acc_len = 0
+    res = ''
+    for ch in text:
+        if ch == ' ':
+            acc_len += 1
+            if acc_len == max_len:
+                res = push_acc_space(acc_len, res)
+                acc_len = 0
+        else:
+            res = push_acc_space(acc_len, res)
+            acc_len = 0
+            res = res + ch
+
+    res = push_acc_space(acc_len, res)
+
+    return res
+
+
+def decode_whitespaces(text: str, start_extra_id: int, max_len: int):
+    """ Decode the whitespace-encoded strings produced by encode_whitespace.
+ + >>> text = 'a\\n b\\n c' + >>> s, l = 10, 10 + >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l) + True + """ + for l in range(2, max_len + 1): # noqa + token_id = start_extra_id - 2 + l + token = f'<|extratoken_{token_id}|>' + text = text.replace(token, ' ' * l) + return text + + +class Code13BDictionary(object): + + def __init__( + self, + dict_file: str, + extra_token_ids: List[str] = None, + pad_to_vocab_size: int = -1, + ): + self._idx = dict() + self._count = dict() + self._num_symbols = 0 + self._symbols = [] + + self._add_symbol('', 0) + self._add_symbol('', 0) + self._add_symbol('', 0) + self._add_symbol('', 0) + self._load_dict(dict_file) + + if extra_token_ids is None: + extra_token_ids = [str(x) for x in range(50257, 50400) + ] # follows GPT-J settings + + for token_id in extra_token_ids: + self._add_symbol(token_id, 0) + + if pad_to_vocab_size > 0: + self._pad_to_vocab_size(pad_to_vocab_size) + + def _pad_to_vocab_size(self, vocab_size: int): + num_pad = vocab_size - len(self) + if num_pad <= 0: + return + for i in range(1, num_pad + 1): + self._add_symbol('vocab_pad_token{}'.format(i), 0) + + def _load_dict(self, dict_file: str): + with open(dict_file, 'r') as f: + for line in f: + line = line.strip() + if line == '' or line.startswith('#'): + continue + sym, count = line.split() + self._add_symbol(sym, int(count)) + + def _add_symbol(self, sym: str, count: int): + self._idx[sym] = self._num_symbols + self._count[sym] = count + self._symbols.append(sym) + self._num_symbols += 1 + + def __len__(self): + return self._num_symbols + + def index(self, sym: str): + return self._idx[sym] + + def string(self, idx: int): + return self._symbols[idx] + + def map_token(self, token: Union[int, str]): + if isinstance(token, int): + token = str(token) + return self.index(token) + + def map_tokens(self, tokens): + return [self.map_token(token) for token in tokens] + + def decode_tokens(self, tokens): + decoded = [ + '50256' if token == 50256 else self.string(token) + for token in tokens + ] + return [int(x) for x in decoded if not x.startswith('vocab_pad_token')] + + +class CodeGeeXTokenizer(object): + + def __init__( + self, + tokenizer: GPT2TokenizerFast = None, + tokenizer_path: str = 'EleutherAI/gpt-j-6B', + start_extra_id: int = 10, + max_len: int = 10, + mode='codegeex-13b', + dict_file: str = None, + ): + self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained( + tokenizer_path) + if mode not in ['codegeex-13b', 'codegeex-python-13b']: + raise ValueError( + f"Invalid mode {mode}, choose from ['codegeex-13b', 'codegeex-python-13b']" + ) + self.start_extra_id = start_extra_id + self.max_len = max_len + self.mode = mode + if dict_file is not None: + self.code_dict = Code13BDictionary( + dict_file, pad_to_vocab_size=51200 + ) if self.mode == 'codegeex-python-13b' else None + else: + self.code_dict = None + self.eos_token_id = self.tokenizer.eos_token_id + + def encode_code(self, code: str): + if self.mode == 'codegeex-13b': + code = encode_whitespaces(code, self.start_extra_id, self.max_len) + input_ids = self.tokenizer( + code, is_split_into_words=False).input_ids + + elif self.mode == 'codegeex-python-13b': + code = encode_whitespaces(code, self.start_extra_id, self.max_len) + input_ids = self.code_dict.map_tokens(self.tokenizer.encode(code)) + input_ids = torch.LongTensor(input_ids).reshape(1, -1) + + return input_ids + + def decode_code(self, input_ids): + if self.mode == 'codegeex-13b': + text = self.tokenizer.decode(input_ids, 
skip_special_tokens=False) + output_code = decode_whitespaces(text, self.start_extra_id, + self.max_len) + elif self.mode == 'codegeex-python-13b': + input_ids = [self.code_dict.decode_tokens(input_ids.tolist()[0])] + text = self.tokenizer.decode(input_ids, skip_special_tokens=False) + output_code = decode_whitespaces(text, self.start_extra_id, + self.max_len) + + return output_code diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 1206ae08..3ffe7b93 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline + from .codegeex_code_translation_pipeline import CodeGeeXCodeTranslationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -73,6 +74,8 @@ else: 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], + 'codegeex_code_translation_pipeline': + ['CodeGeeXCodeTranslationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py new file mode 100755 index 00000000..3c7374da --- /dev/null +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.models.nlp import CodeGeeXForCodeTranslation +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import CodeGeeXPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + group_key=Tasks.code_translation, + module_name=Pipelines.codegeex_code_translation) +class CodeGeeXCodeTranslationPipeline(Pipeline): + + def __init__(self, + model: Union[CodeGeeXForCodeTranslation, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = CodeGeeXForCodeTranslation(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + self.model.half() + self.model.cuda() + if preprocessor is None: + preprocessor = CodeGeeXPreprocessor() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + # define the forward pass + def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: + # check input format + for para in ['prompt', 'source language', 'target language']: + if para not in inputs: + return ('please check your input format.') + return self.model(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 0db1c7e0..ce053459 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, 
TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, - WordSegmentationBlankSetToLabelPreprocessor, + WordSegmentationBlankSetToLabelPreprocessor, CodeGeeXPreprocessor, MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, @@ -57,7 +57,7 @@ else: 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', - 'MGLMSummarizationPreprocessor', + 'MGLMSummarizationPreprocessor', 'CodeGeeXPreprocessor', 'ZeroShotClassificationPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 7c48fb3c..2121543a 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -30,6 +30,7 @@ if TYPE_CHECKING: from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor + from .codegeex_preprocessor import CodeGeeXPreprocessor else: _import_structure = { 'nlp_base': [ @@ -64,6 +65,7 @@ else: 'TextErrorCorrectionPreprocessor', ], 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], + 'codegeex_preprocessor': ['CodeGeeXPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/codegeex_preprocessor.py b/modelscope/preprocessors/nlp/codegeex_preprocessor.py new file mode 100755 index 00000000..f5f462f6 --- /dev/null +++ b/modelscope/preprocessors/nlp/codegeex_preprocessor.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 Zhipu.AI + +import re +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +from modelscope.metainfo import Models, Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.type_assert import type_assert + + +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.codegeex) +class CodeGeeXPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """preprocess the data + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: + return data diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index b1bccc4c..bf3f8fb9 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -120,6 +120,7 @@ class NLPTasks(object): fill_mask = 'fill-mask' text_summarization = 'text-summarization' question_answering = 'question-answering' + code_translation = 'code-translation' zero_shot_classification = 'zero-shot-classification' backbone = 'backbone' text_error_correction = 'text-error-correction' diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py new file mode 100644 index 00000000..d2fd5369 --- /dev/null +++ b/tests/pipelines/test_CodeGeeX_code_translation.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.preprocessors import CodeGeeXPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.output_dir = 'unittest_output' + os.makedirs(self.output_dir, exist_ok=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_CodeGeeX_with_name(self): + model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' + preprocessor = CodeGeeXPreprocessor() + pipe = pipeline( + task=Tasks.code_translation, + model=model, + preprocessor=preprocessor, + ) + inputs = { + 'prompt': 'for i in range(10):\n\tprint(i)\n', + 'source language': 'Python', + 'target language': 'C++' + } + result = pipe(inputs) + print(result) + + +if __name__ == '__main__': + unittest.main() From 2e30caf1e6dfb6a37e39599449583326aef889ae Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 23 Nov 2022 17:29:06 +0800 Subject: [PATCH 19/29] [pipelines] add wenetruntime --- modelscope/metainfo.py | 2 + .../asr/wenet_automatic_speech_recognition.py | 45 ++++++++++ .../audio/asr_wenet_inference_pipeline.py | 87 +++++++++++++++++++ requirements/audio.txt | 1 + 4 files changed, 135 insertions(+) create mode 100644 modelscope/models/audio/asr/wenet_automatic_speech_recognition.py create mode 100644 modelscope/pipelines/audio/asr_wenet_inference_pipeline.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ccd36349..b13e7aec 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -92,6 +92,7 @@ class Models(object): speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' kws_kwsbp = 'kws-kwsbp' generic_asr = 'generic-asr' + wenet_asr = 'wenet-asr' # multi-modal models ofa = 'ofa' @@ -267,6 +268,7 @@ class Pipelines(object): speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' kws_kwsbp = 'kws-kwsbp' asr_inference = 'asr-inference' + asr_wenet_inference = 'asr-wenet-inference' # multi-modal tasks image_captioning = 'image-captioning' diff --git a/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py b/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py new file mode 100644 index 00000000..7db11190 --- /dev/null +++ b/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py @@ -0,0 +1,45 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks + +import wenetruntime as wenet + +__all__ = ['WeNetAutomaticSpeechRecognition'] + + +@MODELS.register_module( + Tasks.auto_speech_recognition, module_name=Models.wenet_asr) +class WeNetAutomaticSpeechRecognition(Model): + + def __init__(self, model_dir: str, am_model_name: str, + model_config: Dict[str, Any], *args, **kwargs): + """initialize the info of model. + + Args: + model_dir (str): the model path. 
+ am_model_name (str): the am model name from configuration.json + model_config (Dict[str, Any]): the detail config about model from configuration.json + """ + super().__init__(model_dir, am_model_name, model_config, *args, + **kwargs) + self.model_cfg = { + # the recognition model dir path + 'model_dir': model_dir, + # the recognition model config dict + 'model_config': model_config + } + self.decoder = None + + def forward(self) -> Dict[str, Any]: + """preload model and return the info of the model + """ + model_dir = self.model_cfg['model_dir'] + self.decoder = wenet.Decoder(model_dir, lang='chs') + + return self.model_cfg diff --git a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py new file mode 100644 index 00000000..33e8c617 --- /dev/null +++ b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py @@ -0,0 +1,87 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import WavToScp +from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav, + load_bytes_from_url) +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['WeNetAutomaticSpeechRecognitionPipeline'] + + +@PIPELINES.register_module( + Tasks.auto_speech_recognition, module_name=Pipelines.asr_wenet_inference) +class WeNetAutomaticSpeechRecognitionPipeline(Pipeline): + """ASR Inference Pipeline + """ + + def __init__(self, + model: Union[Model, str] = None, + preprocessor: WavToScp = None, + **kwargs): + """use `model` and `preprocessor` to create an asr pipeline for prediction + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model_cfg = self.model.forward() + self.decoder = self.model.decoder + + def __call__(self, + audio_in: Union[str, bytes], + audio_fs: int = None, + recog_type: str = None, + audio_format: str = None) -> Dict[str, Any]: + from easyasr.common import asr_utils + + self.recog_type = recog_type + self.audio_format = audio_format + self.audio_fs = audio_fs + + if isinstance(audio_in, str): + # load pcm data from url if audio_in is url str + self.audio_in, checking_audio_fs = load_bytes_from_url(audio_in) + elif isinstance(audio_in, bytes): + # load pcm data from wav data if audio_in is wave format + self.audio_in, checking_audio_fs = extract_pcm_from_wav(audio_in) + else: + self.audio_in = audio_in + + # set the sample_rate of audio_in if checking_audio_fs is valid + if checking_audio_fs is not None: + self.audio_fs = checking_audio_fs + + if recog_type is None or audio_format is None: + self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( + audio_in=self.audio_in, + recog_type=recog_type, + audio_format=audio_format) + + if hasattr(asr_utils, 'sample_rate_checking'): + checking_audio_fs = asr_utils.sample_rate_checking( + self.audio_in, self.audio_format) + if checking_audio_fs is not None: + self.audio_fs = checking_audio_fs + + self.model_cfg['audio'] = self.audio_in + self.model_cfg['audio_fs'] = self.audio_fs + + output = self.forward(self.model_cfg) + rst = self.postprocess(output['asr_result']) + return rst + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """Decoding + """ + inputs['asr_result'] = 
self.decoder.decode(inputs['audio']) + return inputs + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """process the asr results + """ + return inputs diff --git a/requirements/audio.txt b/requirements/audio.txt index bef32121..86c78d3c 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -25,3 +25,4 @@ torchaudio tqdm ttsfrd>=0.0.3 unidecode +wenetruntime From f171552ee3bbc0d334a9a360cebaa3973bf526d5 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Thu, 24 Nov 2022 10:50:38 +0800 Subject: [PATCH 20/29] updated --- modelscope/models/nlp/codegeex/__init__.py | 2 +- modelscope/models/nlp/codegeex/codegeex.py | 2 +- .../codegeex/codegeex_for_code_translation.py | 43 ++++++------------- modelscope/models/nlp/codegeex/inference.py | 41 ++---------------- modelscope/models/nlp/codegeex/tokenizer.py | 4 +- .../nlp/codegeex_code_translation_pipeline.py | 17 ++++---- modelscope/preprocessors/nlp/__init__.py | 2 - .../nlp/codegeex_preprocessor.py | 25 ----------- .../test_CodeGeeX_code_translation.py | 6 +-- 9 files changed, 29 insertions(+), 113 deletions(-) delete mode 100755 modelscope/preprocessors/nlp/codegeex_preprocessor.py diff --git a/modelscope/models/nlp/codegeex/__init__.py b/modelscope/models/nlp/codegeex/__init__.py index 6ee72f80..08add0b0 100755 --- a/modelscope/models/nlp/codegeex/__init__.py +++ b/modelscope/models/nlp/codegeex/__init__.py @@ -1,6 +1,6 @@ # Modified by Zhipu.AI # Original Copyright (c) Alibaba, Inc. and its affiliates. -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/nlp/codegeex/codegeex.py b/modelscope/models/nlp/codegeex/codegeex.py index 7a1b76a3..f8d43008 100755 --- a/modelscope/models/nlp/codegeex/codegeex.py +++ b/modelscope/models/nlp/codegeex/codegeex.py @@ -1,8 +1,8 @@ +# Copyright (c) 2022 Zhipu.AI import math import torch import torch.nn.functional as F -from torch.nn.parameter import Parameter def fast_gelu(x): diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py index 0e9d161b..be3e79f0 100755 --- a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py @@ -1,20 +1,15 @@ # Copyright (c) 2022 Zhipu.AI - import copy -import os -import random -import time -from typing import Dict +from typing import Any, Dict -import numpy as np import torch -from IPython import embed from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import OutputKeys -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger from .codegeex import CodeGeeXModel from .inference import get_token_stream from .tokenizer import CodeGeeXTokenizer @@ -45,18 +40,18 @@ class CodeGeeXForCodeTranslation(TorchModel): model_dir (str): the model path. 
""" super().__init__(model_dir, *args, **kwargs) - + logger = get_logger() # loading tokenizer - print('Loading tokenizer ...') + logger.info('Loading tokenizer ...') self.tokenizer = CodeGeeXTokenizer( tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b') # loading model state_dict_path = model_dir + '/ckpt_ms_translation_0817.pt' - print('Loading state dict ...') + logger.info('Loading state dict ...') state_dict = torch.load(state_dict_path, map_location='cpu') state_dict = state_dict['module'] - print('Building CodeGeeX model ...') + logger.info('Building CodeGeeX model ...') self.model = model_provider() self.model.load_state_dict(state_dict) self.model.eval() @@ -68,21 +63,16 @@ class CodeGeeXForCodeTranslation(TorchModel): seq_length = 2048 out_seq_length = 256 bad_ids = None - print('Generating ...') src_lang = input['source language'] dst_lang = input['target language'] prompt = input['prompt'] prompt = f'code translation\n{src_lang}:\n{prompt}\n{dst_lang}:\n' - t0 = time.perf_counter() + logger = get_logger() tokenizer = self.tokenizer model = self.model for prompt in [prompt]: tokens = tokenizer.encode_code(prompt) - print(tokens) - print('Current prompt:') - print(prompt) n_token_prompt = len(tokens) - print('N_token_prompt:', n_token_prompt) token_stream = get_token_stream( model, tokenizer, @@ -108,19 +98,10 @@ class CodeGeeXForCodeTranslation(TorchModel): generated_code = tokenizer.decode_code( generated_tokens_[n_token_prompt:]) generated_code = ''.join(generated_code) - t1 = time.perf_counter() - print('Total generation time:', t1 - t0, '# Tokens:', - len(generated_tokens_) - n_token_prompt) - print( - f'{(t1 - t0) / (len(generated_tokens_) - n_token_prompt)}s/token' - ) - print( - '================================= Generated code:' - ) - print(generated_code) - t0 = time.perf_counter() + logger.info('================================= Generated code:') + logger.info(generated_code) if all(is_finished): break - print('Generation finished.') + logger.info('Generation finished.') return {OutputKeys.TEXT: generated_code} diff --git a/modelscope/models/nlp/codegeex/inference.py b/modelscope/models/nlp/codegeex/inference.py index 76a9458b..d058f023 100755 --- a/modelscope/models/nlp/codegeex/inference.py +++ b/modelscope/models/nlp/codegeex/inference.py @@ -1,12 +1,8 @@ -import copy -import os -import time -import typing -from dataclasses import dataclass +# Copyright (c) 2022 Zhipu.AI -import json import torch import torch.nn.functional as F +from typing import List def get_ltor_masks_and_position_ids( @@ -128,38 +124,7 @@ def pad_batch(batch, pad_id, seq_length): tokens.extend([pad_id] * (seq_length - context_length)) context_lengths.append(context_length) return batch, context_lengths - - -def forward_step( - model, - tokens, - seq_length, - position_ids, - attention_mask, - layer_past=None, - get_key_value=None, - prompt_length=None, - context_length=None, -): - # Forward pass through the model. 
- output_tensor = model( - tokens, - position_ids, - attention_mask, - layer_past=layer_past, - get_key_value=get_key_value, - prompt_length=prompt_length, - context_length=context_length, - ) - - if get_key_value: - output_tensor, layer_past = output_tensor - - if get_key_value: - return output_tensor, layer_past - - return output_tensor - + def get_token_stream( model, diff --git a/modelscope/models/nlp/codegeex/tokenizer.py b/modelscope/models/nlp/codegeex/tokenizer.py index 66958d7d..cc507eb6 100755 --- a/modelscope/models/nlp/codegeex/tokenizer.py +++ b/modelscope/models/nlp/codegeex/tokenizer.py @@ -1,8 +1,8 @@ -import typing - +# Copyright (c) 2022 Zhipu.AI import torch from transformers import AutoTokenizer from transformers.models.gpt2 import GPT2TokenizerFast +from typing import List, Union def encode_whitespaces(text, start_extra_id: int, max_len: int): diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index 3c7374da..f2bce381 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -1,13 +1,12 @@ # Copyright (c) 2022 Zhipu.AI -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Union from modelscope.metainfo import Pipelines -from modelscope.models.base import Model from modelscope.models.nlp import CodeGeeXForCodeTranslation -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import CodeGeeXPreprocessor, Preprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks @@ -27,16 +26,18 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): self.model.eval() self.model.half() self.model.cuda() - if preprocessor is None: - preprocessor = CodeGeeXPreprocessor() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + super().__init__(model=model, **kwargs) + + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: + return inputs # define the forward pass def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: # check input format for para in ['prompt', 'source language', 'target language']: if para not in inputs: - return ('please check your input format.') + raise Exception('please check your input format.') return self.model(inputs) # format the outputs from pipeline diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 2121543a..7c48fb3c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -30,7 +30,6 @@ if TYPE_CHECKING: from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor - from .codegeex_preprocessor import CodeGeeXPreprocessor else: _import_structure = { 'nlp_base': [ @@ -65,7 +64,6 @@ else: 'TextErrorCorrectionPreprocessor', ], 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], - 'codegeex_preprocessor': ['CodeGeeXPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/codegeex_preprocessor.py b/modelscope/preprocessors/nlp/codegeex_preprocessor.py deleted file mode 100755 index 
f5f462f6..00000000 --- a/modelscope/preprocessors/nlp/codegeex_preprocessor.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 Zhipu.AI - -import re -from typing import Any, Dict, Iterable, Optional, Tuple, Union - -from modelscope.metainfo import Models, Preprocessors -from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile -from modelscope.utils.type_assert import type_assert - - -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.codegeex) -class CodeGeeXPreprocessor(Preprocessor): - - def __init__(self, *args, **kwargs): - """preprocess the data - Args: - model_dir (str): model path - """ - super().__init__(*args, **kwargs) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - return data diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py index d2fd5369..a56ae00e 100644 --- a/tests/pipelines/test_CodeGeeX_code_translation.py +++ b/tests/pipelines/test_CodeGeeX_code_translation.py @@ -2,9 +2,7 @@ import os import unittest -from modelscope.models import Model from modelscope.pipelines import pipeline -from modelscope.preprocessors import CodeGeeXPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -19,11 +17,9 @@ class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_CodeGeeX_with_name(self): model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' - preprocessor = CodeGeeXPreprocessor() pipe = pipeline( task=Tasks.code_translation, - model=model, - preprocessor=preprocessor, + model=model ) inputs = { 'prompt': 'for i in range(10):\n\tprint(i)\n', From 1ab8a1f764b33b7be174619520af2a2f8958ffbe Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Thu, 24 Nov 2022 11:20:25 +0800 Subject: [PATCH 21/29] updated --- .../models/nlp/codegeex/codegeex_for_code_translation.py | 4 +++- modelscope/models/nlp/codegeex/inference.py | 5 +++-- modelscope/models/nlp/codegeex/tokenizer.py | 3 ++- .../pipelines/nlp/codegeex_code_translation_pipeline.py | 4 ++-- tests/pipelines/test_CodeGeeX_code_translation.py | 5 +---- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py index be3e79f0..fece907d 100755 --- a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py @@ -98,7 +98,9 @@ class CodeGeeXForCodeTranslation(TorchModel): generated_code = tokenizer.decode_code( generated_tokens_[n_token_prompt:]) generated_code = ''.join(generated_code) - logger.info('================================= Generated code:') + logger.info( + '================================= Generated code:' + ) logger.info(generated_code) if all(is_finished): break diff --git a/modelscope/models/nlp/codegeex/inference.py b/modelscope/models/nlp/codegeex/inference.py index d058f023..38f14d6c 100755 --- a/modelscope/models/nlp/codegeex/inference.py +++ b/modelscope/models/nlp/codegeex/inference.py @@ -1,8 +1,9 @@ # Copyright (c) 2022 Zhipu.AI +from typing import List + import torch import torch.nn.functional as F -from 
typing import List def get_ltor_masks_and_position_ids( @@ -124,7 +125,7 @@ def pad_batch(batch, pad_id, seq_length): tokens.extend([pad_id] * (seq_length - context_length)) context_lengths.append(context_length) return batch, context_lengths - + def get_token_stream( model, diff --git a/modelscope/models/nlp/codegeex/tokenizer.py b/modelscope/models/nlp/codegeex/tokenizer.py index cc507eb6..a5da9a3c 100755 --- a/modelscope/models/nlp/codegeex/tokenizer.py +++ b/modelscope/models/nlp/codegeex/tokenizer.py @@ -1,8 +1,9 @@ # Copyright (c) 2022 Zhipu.AI +from typing import List, Union + import torch from transformers import AutoTokenizer from transformers.models.gpt2 import GPT2TokenizerFast -from typing import List, Union def encode_whitespaces(text, start_extra_id: int, max_len: int): diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index f2bce381..ef0f29e0 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -28,9 +28,9 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): self.model.cuda() super().__init__(model=model, **kwargs) - + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: - return inputs + return inputs # define the forward pass def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py index a56ae00e..0972c494 100644 --- a/tests/pipelines/test_CodeGeeX_code_translation.py +++ b/tests/pipelines/test_CodeGeeX_code_translation.py @@ -17,10 +17,7 @@ class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_CodeGeeX_with_name(self): model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' - pipe = pipeline( - task=Tasks.code_translation, - model=model - ) + pipe = pipeline(task=Tasks.code_translation, model=model) inputs = { 'prompt': 'for i in range(10):\n\tprint(i)\n', 'source language': 'Python', From 2605824dea612f2780ccbabb9ba7cf53bc89bfb8 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 23 Nov 2022 21:58:03 +0800 Subject: [PATCH 22/29] [tests] add unittest --- .../asr/wenet_automatic_speech_recognition.py | 23 ++- .../audio/asr_wenet_inference_pipeline.py | 14 +- ...test_wenet_automatic_speech_recognition.py | 131 ++++++++++++++++++ 3 files changed, 146 insertions(+), 22 deletions(-) create mode 100644 tests/pipelines/test_wenet_automatic_speech_recognition.py diff --git a/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py b/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py index 7db11190..1947629f 100644 --- a/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py +++ b/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py @@ -8,6 +8,7 @@ from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.constant import Tasks +import json import wenetruntime as wenet __all__ = ['WeNetAutomaticSpeechRecognition'] @@ -23,23 +24,15 @@ class WeNetAutomaticSpeechRecognition(Model): Args: model_dir (str): the model path. 
- am_model_name (str): the am model name from configuration.json - model_config (Dict[str, Any]): the detail config about model from configuration.json """ super().__init__(model_dir, am_model_name, model_config, *args, **kwargs) - self.model_cfg = { - # the recognition model dir path - 'model_dir': model_dir, - # the recognition model config dict - 'model_config': model_config - } - self.decoder = None - - def forward(self) -> Dict[str, Any]: - """preload model and return the info of the model - """ - model_dir = self.model_cfg['model_dir'] self.decoder = wenet.Decoder(model_dir, lang='chs') - return self.model_cfg + def forward(self, inputs: Dict[str, Any]) -> str: + if inputs['audio_format'] == 'wav': + rst = self.decoder.decode_wav(inputs['audio']) + else: + rst = self.decoder.decode(inputs['audio']) + text = json.loads(rst)['nbest'][0]['sentence'] + return {'text': text} diff --git a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py index 33e8c617..6df47bcb 100644 --- a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py @@ -29,8 +29,6 @@ class WeNetAutomaticSpeechRecognitionPipeline(Pipeline): """use `model` and `preprocessor` to create an asr pipeline for prediction """ super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.model_cfg = self.model.forward() - self.decoder = self.model.decoder def __call__(self, audio_in: Union[str, bytes], @@ -68,17 +66,19 @@ class WeNetAutomaticSpeechRecognitionPipeline(Pipeline): if checking_audio_fs is not None: self.audio_fs = checking_audio_fs - self.model_cfg['audio'] = self.audio_in - self.model_cfg['audio_fs'] = self.audio_fs - - output = self.forward(self.model_cfg) + inputs = { + 'audio': self.audio_in, + 'audio_format': self.audio_format, + 'audio_fs': self.audio_fs + } + output = self.forward(inputs) rst = self.postprocess(output['asr_result']) return rst def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """Decoding """ - inputs['asr_result'] = self.decoder.decode(inputs['audio']) + inputs['asr_result'] = self.model(inputs) return inputs def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/tests/pipelines/test_wenet_automatic_speech_recognition.py b/tests/pipelines/test_wenet_automatic_speech_recognition.py new file mode 100644 index 00000000..4adf8119 --- /dev/null +++ b/tests/pipelines/test_wenet_automatic_speech_recognition.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import unittest +from typing import Any, Dict, Union + +import numpy as np +import soundfile + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import ColorCodes, Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import download_and_untar, test_level + +logger = get_logger() + +WAV_FILE = 'data/test/audios/asr_example.wav' +URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav' + + +class WeNetAutomaticSpeechRecognitionTest(unittest.TestCase, + DemoCompatibilityCheck): + action_info = { + 'test_run_with_pcm': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'test_run_with_url': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'test_run_with_wav': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'wav_example': { + 'text': '每一天都要快乐喔' + } + } + + def setUp(self) -> None: + self.am_model_id = 'wenet/u2pp_conformer-asr-cn-16k-online' + # this temporary workspace dir will store waveform files + self.workspace = os.path.join(os.getcwd(), '.tmp') + self.task = Tasks.auto_speech_recognition + if not os.path.exists(self.workspace): + os.mkdir(self.workspace) + + def tearDown(self) -> None: + # remove workspace dir (.tmp) + shutil.rmtree(self.workspace, ignore_errors=True) + + def run_pipeline(self, + model_id: str, + audio_in: Union[str, bytes], + sr: int = None) -> Dict[str, Any]: + inference_16k_pipline = pipeline( + task=Tasks.auto_speech_recognition, model=model_id) + rec_result = inference_16k_pipline(audio_in, audio_fs=sr) + return rec_result + + def log_error(self, functions: str, result: Dict[str, Any]) -> None: + logger.error(ColorCodes.MAGENTA + functions + ': FAILED.' + + ColorCodes.END) + logger.error( + ColorCodes.MAGENTA + functions + ' correct result example:' + + ColorCodes.YELLOW + + str(self.action_info[self.action_info[functions]['example']]) + + ColorCodes.END) + raise ValueError('asr result is mismatched') + + def check_result(self, functions: str, result: Dict[str, Any]) -> None: + if result.__contains__(self.action_info[functions]['checking_item']): + logger.info(ColorCodes.MAGENTA + functions + ': SUCCESS.' 
+ + ColorCodes.END) + logger.info( + ColorCodes.YELLOW + + str(result[self.action_info[functions]['checking_item']]) + + ColorCodes.END) + else: + self.log_error(functions, result) + + def wav2bytes(self, wav_file): + audio, fs = soundfile.read(wav_file) + + # float32 -> int16 + audio = np.asarray(audio) + dtype = np.dtype('int16') + i = np.iinfo(dtype) + abs_max = 2**(i.bits - 1) + offset = i.min + abs_max + audio = (audio * abs_max + offset).clip(i.min, i.max).astype(dtype) + + # int16(PCM_16) -> byte + audio = audio.tobytes() + return audio, fs + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_pcm(self): + """run with wav data + """ + logger.info('Run ASR test with wav data (wenet)...') + audio, sr = self.wav2bytes(os.path.join(os.getcwd(), WAV_FILE)) + rec_result = self.run_pipeline( + model_id=self.am_model_id, audio_in=audio, sr=sr) + self.check_result('test_run_with_pcm', rec_result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_wav(self): + """run with single waveform file + """ + logger.info('Run ASR test with waveform file (wenet)...') + wav_file_path = os.path.join(os.getcwd(), WAV_FILE) + rec_result = self.run_pipeline( + model_id=self.am_model_id, audio_in=wav_file_path) + self.check_result('test_run_with_wav', rec_result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_url(self): + """run with single url file + """ + logger.info('Run ASR test with url file (wenet)...') + rec_result = self.run_pipeline( + model_id=self.am_model_id, audio_in=URL_FILE) + self.check_result('test_run_with_url', rec_result) + + +if __name__ == '__main__': + unittest.main() From eb2ef3a1cfc7ec511e73cc37d7d66a544dc59dfb Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Thu, 24 Nov 2022 19:48:48 +0800 Subject: [PATCH 23/29] [lint] fix lint --- .../models/audio/asr/wenet_automatic_speech_recognition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py b/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py index 1947629f..feb822d4 100644 --- a/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py +++ b/modelscope/models/audio/asr/wenet_automatic_speech_recognition.py @@ -3,14 +3,14 @@ import os from typing import Any, Dict +import json +import wenetruntime as wenet + from modelscope.metainfo import Models from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.constant import Tasks -import json -import wenetruntime as wenet - __all__ = ['WeNetAutomaticSpeechRecognition'] From b0cf09d7b0bf25e110f6fb52aa77161f6cd1deea Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Thu, 24 Nov 2022 22:12:58 +0800 Subject: [PATCH 24/29] [ci] chang pypi url to tsinghua --- .dev_scripts/ci_container_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index a3f13137..35b43535 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,4 +1,5 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple pip install -r requirements/tests.txt git config --global --add safe.directory /Maas-lib git config --global user.email tmp From a2532210af2712aa87ff0a72065ed84e567779f8 Mon Sep 17 00:00:00 2001 From: 
pengzhendong <275331498@qq.com> Date: Fri, 25 Nov 2022 11:47:25 +0800 Subject: [PATCH 25/29] fix wenetruntime version --- requirements/audio.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/audio.txt b/requirements/audio.txt index 86c78d3c..037bb839 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -25,4 +25,5 @@ torchaudio tqdm ttsfrd>=0.0.3 unidecode -wenetruntime +# wenetruntime version should be the same as torch +wenetruntime==1.11 From 65adde14d8b2f6e13cc44983b439e319d0a7cf66 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Fri, 25 Nov 2022 11:55:53 +0800 Subject: [PATCH 26/29] remove uttest --- .../test_CodeGeeX_code_translation.py | 31 ------------------- 1 file changed, 31 deletions(-) delete mode 100644 tests/pipelines/test_CodeGeeX_code_translation.py diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py deleted file mode 100644 index 0972c494..00000000 --- a/tests/pipelines/test_CodeGeeX_code_translation.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -import unittest - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): - - def setUp(self) -> None: - self.output_dir = 'unittest_output' - os.makedirs(self.output_dir, exist_ok=True) - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_CodeGeeX_with_name(self): - model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' - pipe = pipeline(task=Tasks.code_translation, model=model) - inputs = { - 'prompt': 'for i in range(10):\n\tprint(i)\n', - 'source language': 'Python', - 'target language': 'C++' - } - result = pipe(inputs) - print(result) - - -if __name__ == '__main__': - unittest.main() From 02d2469e55347c95349820caf660f2df1128fb58 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Fri, 25 Nov 2022 15:37:45 +0800 Subject: [PATCH 27/29] check wenetruntime --- modelscope/utils/error.py | 5 +++++ modelscope/utils/import_utils.py | 7 +++++++ requirements/audio.txt | 2 -- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index a894063c..8128f7b0 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -70,6 +70,11 @@ PYTORCH_IMPORT_ERROR = """ installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. """ +WENETRUNTIME_IMPORT_ERROR = """ +{0} requires the wenetruntime library but it was not found in your environment. You can install it with pip: +`pip install wenetruntime==TORCH_VER` +""" + # docstyle-ignore SCIPY_IMPORT_ERROR = """ {0} requires the scipy library but it was not found in your environment. 
You can install it with pip: diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 5db5ea98..64072eee 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -245,6 +245,10 @@ def is_torch_cuda_available(): return False +def is_wenetruntime_available(): + return importlib.util.find_spec('wenetruntime') is not None + + def is_tf_available(): return _tf_available @@ -280,6 +284,9 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('timm', (is_timm_available, TIMM_IMPORT_ERROR)), ('tokenizers', (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), + ('wenetruntime', + (is_wenetruntime_available, + WENETRUNTIME_IMPORT_ERROR.replace('TORCH_VER', _torch_version))), ('scipy', (is_scipy_available, SCIPY_IMPORT_ERROR)), ('cv2', (is_opencv_available, OPENCV_IMPORT_ERROR)), ('PIL', (is_pillow_available, PILLOW_IMPORT_ERROR)), diff --git a/requirements/audio.txt b/requirements/audio.txt index 037bb839..bef32121 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -25,5 +25,3 @@ torchaudio tqdm ttsfrd>=0.0.3 unidecode -# wenetruntime version should be the same as torch -wenetruntime==1.11 From c9064caa58d7e207834478423a66bf82025e23e0 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Fri, 25 Nov 2022 16:35:19 +0800 Subject: [PATCH 28/29] add code_generation --- modelscope/metainfo.py | 2 +- modelscope/models/nlp/__init__.py | 4 +- modelscope/models/nlp/codegeex/__init__.py | 2 + .../codegeex/codegeex_for_code_generation.py | 111 ++++++++++++++++++ modelscope/pipelines/nlp/__init__.py | 3 + .../nlp/codegeex_code_generation_pipeline.py | 48 ++++++++ .../nlp/codegeex_code_translation_pipeline.py | 6 + modelscope/utils/constant.py | 1 + 8 files changed, 174 insertions(+), 3 deletions(-) create mode 100755 modelscope/models/nlp/codegeex/codegeex_for_code_generation.py create mode 100755 modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 99f4a047..c74eaeb2 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -257,6 +257,7 @@ class Pipelines(object): feature_extraction = 'feature-extraction' mglm_text_summarization = 'mglm-text-summarization' codegeex_code_translation = 'codegeex-code-translation' + codegeex_code_generation = 'codegeex-code-generation' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -384,7 +385,6 @@ class Preprocessors(object): document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' mglm_summarization = 'mglm-summarization' - codegeex = 'codegeex' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 3f9d224c..5f8b88f9 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -36,7 +36,7 @@ if TYPE_CHECKING: ) from .T5 import T5ForConditionalGeneration from .mglm import MGLMForTextSummarization - from .codegeex import CodeGeeXForCodeTranslation + from .codegeex import CodeGeeXForCodeTranslation, CodeGeeXForCodeGeneration from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -109,7 +109,7 @@ else: 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], 'mglm': ['MGLMForTextSummarization'], - 'codegeex': 
['CodeGeeXForCodeTranslation'], + 'codegeex': ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/codegeex/__init__.py b/modelscope/models/nlp/codegeex/__init__.py index 08add0b0..0bcdb4bc 100755 --- a/modelscope/models/nlp/codegeex/__init__.py +++ b/modelscope/models/nlp/codegeex/__init__.py @@ -6,9 +6,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .codegeex_for_code_translation import CodeGeeXForCodeTranslation + from .codegeex_for_code_generation import CodeGeeXForCodeGeneration else: _import_structure = { 'codegeex_for_code_translation': ['CodeGeeXForCodeTranslation'], + 'codegeex_for_code_generation': ['CodeGeeXForCodeGeneration'], } import sys diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py new file mode 100755 index 00000000..dbe6d4a4 --- /dev/null +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 Zhipu.AI +import copy +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .codegeex import CodeGeeXModel +from .inference import get_token_stream +from .tokenizer import CodeGeeXTokenizer + + +def model_provider(): + """Build the model.""" + + hidden_size = 5120 + num_attention_heads = 40 + num_layers = 39 + padded_vocab_size = 52224 + max_position_embeddings = 2048 + + model = CodeGeeXModel(hidden_size, num_layers, num_attention_heads, + padded_vocab_size, max_position_embeddings) + + return model + + +@MODELS.register_module(Tasks.code_generation, module_name=Models.codegeex) +class CodeGeeXForCodeGeneration(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the CodeGeeX code generation model from the `model_dir` path. + + Args: + model_dir (str): the model path.
+ """ + super().__init__(model_dir, *args, **kwargs) + logger = get_logger() + # loading tokenizer + logger.info('Loading tokenizer ...') + self.tokenizer = CodeGeeXTokenizer( + tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b') + # loading model + state_dict_path = model_dir + '/ckpt_ms_213000_fp32_52224.pt' + logger.info('Loading state dict ...') + state_dict = torch.load(state_dict_path, map_location='cpu') + state_dict = state_dict['module'] + + logger.info('Building CodeGeeX model ...') + self.model = model_provider() + self.model.load_state_dict(state_dict) + self.model.eval() + self.model.half() + self.model.cuda() + + def forward(self, input: Dict[str, str]) -> Dict[str, str]: + micro_batch_size = 1 + seq_length = 2048 + out_seq_length = 256 + bad_ids = None + lang = input['language'] + prompt = input['prompt'] + prompt = f"# language: {lang}\n{prompt}" + logger = get_logger() + tokenizer = self.tokenizer + model = self.model + for prompt in [prompt]: + tokens = tokenizer.encode_code(prompt) + n_token_prompt = len(tokens) + token_stream = get_token_stream( + model, + tokenizer, + seq_length, + out_seq_length, + [copy.deepcopy(tokens) for _ in range(micro_batch_size)], + micro_batch_size=micro_batch_size, + bad_ids=bad_ids, + topk=1, + topp=0.9, + temperature=0.9, + greedy=True + ) + is_finished = [False for _ in range(micro_batch_size)] + for i, generated in enumerate(token_stream): + generated_tokens = generated[0] + for j in range(micro_batch_size): + if is_finished[j]: + continue + if generated_tokens[j].cpu().numpy( + )[-1] == tokenizer.eos_token_id or len( + generated_tokens[j]) >= out_seq_length: + is_finished[j] = True + generated_tokens_ = generated_tokens[j].cpu().numpy( + ).tolist() + generated_code = tokenizer.decode_code( + generated_tokens_[n_token_prompt:]) + generated_code = ''.join(generated_code) + logger.info( + '================================= Generated code:' + ) + logger.info(generated_code) + if all(is_finished): + break + + logger.info('Generation finished.') + return {OutputKeys.TEXT: generated_code} diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 3ffe7b93..cbea8436 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -33,6 +33,7 @@ if TYPE_CHECKING: from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline from .codegeex_code_translation_pipeline import CodeGeeXCodeTranslationPipeline + from .codegeex_code_generation_pipeline import CodeGeeXCodeGenerationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -76,6 +77,8 @@ else: 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], 'codegeex_code_translation_pipeline': ['CodeGeeXCodeTranslationPipeline'], + 'codegeex_code_generation_pipeline': + ['CodeGeeXCodeGenerationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py new file mode 100755 index 00000000..2eaebca3 --- /dev/null +++ b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.nlp 
import CodeGeeXForCodeGeneration +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + group_key=Tasks.code_generation, + module_name=Pipelines.codegeex_code_generation) +class CodeGeeXCodeGenerationPipeline(Pipeline): + + def __init__(self, + model: Union[CodeGeeXForCodeGeneration, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = CodeGeeXForCodeGeneration(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + self.model.half() + self.model.cuda() + + super().__init__(model=model, **kwargs) + + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: + return inputs + + # define the forward pass + def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: + # check input format + for para in ['prompt', 'language']: + if para not in inputs: + raise Exception('Please check your input format.') + if inputs['language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa + raise Exception('Make sure the language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + + return self.model(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index ef0f29e0..61be5620 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -38,6 +38,12 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): for para in ['prompt', 'source language', 'target language']: if para not in inputs: raise Exception('please check your input format.') + if inputs['source language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa + raise Exception('Make sure the source language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + + if inputs['target language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa + raise Exception('Make sure the target language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + return self.model(inputs) # format the outputs from pipeline diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index bf3f8fb9..6cd7a571 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -121,6 +121,7 @@ class NLPTasks(object): text_summarization = 'text-summarization' question_answering 
= 'question-answering' code_translation = 'code-translation' + code_generation = 'code-generation' zero_shot_classification = 'zero-shot-classification' backbone = 'backbone' text_error_correction = 'text-error-correction' From 028551cd62ee57c081c637dc32cc6a0a6e356dd2 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Fri, 25 Nov 2022 16:41:44 +0800 Subject: [PATCH 29/29] add code_generation files --- modelscope/models/nlp/__init__.py | 3 ++- .../codegeex/codegeex_for_code_generation.py | 5 ++--- .../nlp/codegeex_code_generation_pipeline.py | 13 ++++++++--- .../nlp/codegeex_code_translation_pipeline.py | 22 +++++++++++++++---- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 5f8b88f9..3d4f8c7d 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -109,7 +109,8 @@ else: 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], 'mglm': ['MGLMForTextSummarization'], - 'codegeex': ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], + 'codegeex': + ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py index dbe6d4a4..ff191cba 100755 --- a/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py @@ -65,7 +65,7 @@ class CodeGeeXForCodeGeneration(TorchModel): bad_ids = None lang = input['language'] prompt = input['prompt'] - prompt = f"# language: {lang}\n{prompt}" + prompt = f'# language: {lang}\n{prompt}' logger = get_logger() tokenizer = self.tokenizer model = self.model @@ -83,8 +83,7 @@ class CodeGeeXForCodeGeneration(TorchModel): topk=1, topp=0.9, temperature=0.9, - greedy=True - ) + greedy=True) is_finished = [False for _ in range(micro_batch_size)] for i, generated in enumerate(token_stream): generated_tokens = generated[0] diff --git a/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py index 2eaebca3..f23461b1 100755 --- a/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py @@ -21,7 +21,7 @@ class CodeGeeXCodeGenerationPipeline(Pipeline): *args, **kwargs): model = CodeGeeXForCodeGeneration(model) if isinstance(model, - str) else model + str) else model self.model = model self.model.eval() self.model.half() @@ -38,8 +38,15 @@ class CodeGeeXCodeGenerationPipeline(Pipeline): for para in ['prompt', 'language']: if para not in inputs: raise Exception('Please check your input format.') - if inputs['language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa - raise Exception('Make sure the language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + if inputs['language'] not in [ + 'C++', 'C', 'C#', 'Cuda', 'Objective-C', 'Objective-C++', + 'Python', 'Java', 'Scala', 'TeX', 'HTML', 'PHP', 'JavaScript', + 'TypeScript', 'Go', 'Shell', 'Rust', 'CSS', 'SQL', 'Kotlin', + 'Pascal', 'R', 'Fortran', 'Lean' + ]: 
# noqa + raise Exception( + 'Make sure the language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]' # noqa + ) # noqa return self.model(inputs) diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index 61be5620..8bd5a6da 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -38,11 +38,25 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): for para in ['prompt', 'source language', 'target language']: if para not in inputs: raise Exception('please check your input format.') - if inputs['source language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa - raise Exception('Make sure the source language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + if inputs['source language'] not in [ + 'C++', 'C', 'C#', 'Cuda', 'Objective-C', 'Objective-C++', + 'Python', 'Java', 'Scala', 'TeX', 'HTML', 'PHP', 'JavaScript', + 'TypeScript', 'Go', 'Shell', 'Rust', 'CSS', 'SQL', 'Kotlin', + 'Pascal', 'R', 'Fortran', 'Lean' + ]: + raise Exception( + 'Make sure the source language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]' # noqa + ) # noqa - if inputs['target language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa - raise Exception('Make sure the target language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + if inputs['target language'] not in [ + 'C++', 'C', 'C#', 'Cuda', 'Objective-C', 'Objective-C++', + 'Python', 'Java', 'Scala', 'TeX', 'HTML', 'PHP', 'JavaScript', + 'TypeScript', 'Go', 'Shell', 'Rust', 'CSS', 'SQL', 'Kotlin', + 'Pascal', 'R', 'Fortran', 'Lean' + ]: + raise Exception( + 'Make sure the target language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]' # noqa + ) # noqa return self.model(inputs)
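
A minimal usage sketch for the two CodeGeeX pipelines added in the last two patches, assuming a CUDA-capable GPU (both pipelines run the model in half precision on GPU). The translation model id below comes from the unit test removed in PATCH 26; the generation model id is a placeholder, since no model id for the generation pipeline appears in this series.

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Code translation: the pipeline expects a dict with 'prompt',
# 'source language' and 'target language' keys.
translator = pipeline(
    task=Tasks.code_translation,
    model='ZhipuAI/CodeGeeX-Code-Translation-13B')
result = translator({
    'prompt': 'for i in range(10):\n\tprint(i)\n',
    'source language': 'Python',
    'target language': 'C++',
})
print(result[OutputKeys.TEXT])

# Code generation: the pipeline expects 'prompt' and 'language' keys,
# where 'language' must be one of the names checked in the pipeline.
generator = pipeline(
    task=Tasks.code_generation,
    model='ZhipuAI/CodeGeeX-Code-Generation-13B')  # placeholder model id, not named in this patch series
result = generator({
    'prompt': '# write a bubble sort function\n',
    'language': 'Python',
})
print(result[OutputKeys.TEXT])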