diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 4fd2778f..81c28513 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -2,13 +2,7 @@
 echo "Testing envs"
 printenv
 echo "ENV END"
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
-    awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
     pip install -r requirements/tests.txt
-    git config --global --add safe.directory /Maas-lib
     git config --global user.email tmp
     git config --global user.name tmp.com
@@ -19,9 +13,22 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
         pre-commit run -c .pre-commit-config_local.yaml --all-files
         if [ $? -ne 0 ]; then
             echo "linter test failed, please run 'pre-commit run --all-files' to check"
+            echo "From the repository folder:"
+            echo "Run 'pip install -r requirements/tests.txt' to install the test dependencies."
+            echo "Run 'pre-commit install' to install the pre-commit hooks."
+            echo "Finally, run the linter with 'pre-commit run --all-files' to check."
+            echo "Ensure there are no failures."
             exit -1
         fi
     fi
+
+    awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r requirements/tests.txt
     # test with install
     python setup.py install
 else
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 6dafbc3e..160e2604 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -7,6 +7,7 @@ ENV PATH="${CONDA_DIR}/bin:${PATH}"
 ENV arch=x86_64
 SHELL ["/bin/bash", "-c"]
 COPY docker/rcfiles /tmp/resources
+COPY docker/jupyter_plugins /tmp/resources/jupyter_plugins
 RUN apt-get update && apt-get install -y --reinstall ca-certificates && \
     cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \
     apt-get update && \
@@ -26,7 +27,7 @@ ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8

 #install and config python
 ARG PYTHON_VERSION=3.7.13
-RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \
+RUN wget --quiet https://mirrors.aliyun.com/anaconda/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \
     /bin/bash miniconda.sh -b -p /opt/conda && \
     rm -f miniconda.sh && \
     ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
@@ -34,8 +35,8 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${a
     cp /tmp/resources/conda.tuna ~/.condarc && \
     source /root/.bashrc && \
     conda install --yes python==${PYTHON_VERSION} && \
-    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
-    pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
+    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
+    pip config set install.trusted-host mirrors.aliyun.com

 ARG USE_GPU=True

@@ -70,16 +71,38 @@ RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
+    pip install --no-cache-dir -r /var/modelscope/science.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip cache purge

 # default shell bash
 ENV SHELL=/bin/bash

 # install special package
-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl
-
 RUN if [ "$USE_GPU" = "True" ] ; then \
         pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
     else \
         pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \
     fi
+
+# install jupyter plugin
+RUN mkdir -p /root/.local/share/jupyter/labextensions/ && \
+    cp -r /tmp/resources/jupyter_plugins/* /root/.local/share/jupyter/labextensions/
+
+COPY docker/scripts/modelscope_env_init.sh /usr/local/bin/ms_env_init.sh
+RUN pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl --force
+
+# for unifold
+COPY docker/scripts/install_unifold.sh /tmp/install_unifold.sh
+RUN if [ "$USE_GPU" = "True" ] ; then \
+        bash /tmp/install_unifold.sh; \
+    else \
+        echo 'unifold is not supported on CPU'; \
+    fi
+
+RUN pip install --no-cache-dir 'mmcls>=0.21.0' 'mmdet>=2.25.0' 'decord>=0.6.0' datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext deepspeed
+COPY docker/scripts/install_apex.sh /tmp/install_apex.sh
+RUN if [ "$USE_GPU" = "True" ] ; then \
+        bash /tmp/install_apex.sh; \
+    else \
+        echo 'apex is not installed for CPU'; \
+    fi
diff --git a/docker/jupyter_plugins/jupyterlab_active_log/package.json b/docker/jupyter_plugins/jupyterlab_active_log/package.json
new file mode 100644
index 00000000..d2e0d0db
--- /dev/null
+++ b/docker/jupyter_plugins/jupyterlab_active_log/package.json
@@ -0,0 +1,99 @@
+{
+  "name": "jupyterlab_active_log",
+  "version": "0.1.0",
+  "description": "A JupyterLab extension.",
+  "keywords": [
+    "jupyter",
+    "jupyterlab",
+    "jupyterlab-extension"
+  ],
+  "homepage": "https://github.com/github_username/jupyterlab_active_log",
+  "bugs": {
+    "url": "https://github.com/github_username/jupyterlab_active_log/issues"
+  },
+  "license": "BSD-3-Clause",
+  "files": [
"lib/**/*.{d.ts,eot,gif,html,jpg,js,js.map,json,png,svg,woff2,ttf}", + "style/**/*.{css,js,eot,gif,html,jpg,json,png,svg,woff2,ttf}" + ], + "main": "lib/index.js", + "types": "lib/index.d.ts", + "style": "style/index.css", + "repository": { + "type": "git", + "url": "https://github.com/github_username/jupyterlab_active_log.git" + }, + "scripts": { + "build": "jlpm build:lib && jlpm build:labextension:dev", + "build:prod": "jlpm clean && jlpm build:lib && jlpm build:labextension", + "build:labextension": "jupyter labextension build .", + "build:labextension:dev": "jupyter labextension build --development True .", + "build:lib": "tsc", + "clean": "jlpm clean:lib", + "clean:lib": "rimraf lib tsconfig.tsbuildinfo", + "clean:lintcache": "rimraf .eslintcache .stylelintcache", + "clean:labextension": "rimraf jupyterlab_active_log/labextension", + "clean:all": "jlpm clean:lib && jlpm clean:labextension && jlpm clean:lintcache", + "eslint": "jlpm eslint:check --fix", + "eslint:check": "eslint . --cache --ext .ts,.tsx", + "install:extension": "jlpm build", + "lint": "jlpm stylelint && jlpm prettier && jlpm eslint", + "lint:check": "jlpm stylelint:check && jlpm prettier:check && jlpm eslint:check", + "prettier": "jlpm prettier:base --write --list-different", + "prettier:base": "prettier \"**/*{.ts,.tsx,.js,.jsx,.css,.json,.md}\"", + "prettier:check": "jlpm prettier:base --check", + "stylelint": "jlpm stylelint:check --fix", + "stylelint:check": "stylelint --cache \"style/**/*.css\"", + "watch": "run-p watch:src watch:labextension", + "watch:src": "tsc -w", + "watch:labextension": "jupyter labextension watch ." + }, + "dependencies": { + "@jupyterlab/application": "^3.1.0" + }, + "devDependencies": { + "@jupyterlab/builder": "^3.1.0", + "@typescript-eslint/eslint-plugin": "^4.8.1", + "@typescript-eslint/parser": "^4.8.1", + "eslint": "^7.14.0", + "eslint-config-prettier": "^6.15.0", + "eslint-plugin-prettier": "^3.1.4", + "npm-run-all": "^4.1.5", + "prettier": "^2.1.1", + "rimraf": "^3.0.2", + "stylelint": "^14.3.0", + "stylelint-config-prettier": "^9.0.3", + "stylelint-config-recommended": "^6.0.0", + "stylelint-config-standard": "~24.0.0", + "stylelint-prettier": "^2.0.0", + "typescript": "~4.1.3" + }, + "sideEffects": [ + "style/*.css", + "style/index.js" + ], + "styleModule": "style/index.js", + "publishConfig": { + "access": "public" + }, + "jupyterlab": { + "extension": true, + "outputDir": "jupyterlab_active_log/labextension", + "_build": { + "load": "static/remoteEntry.eb3177c3791d7658cc12.js", + "extension": "./extension", + "style": "./style" + } + }, + "jupyter-releaser": { + "hooks": { + "before-build-npm": [ + "python -m pip install jupyterlab~=3.1", + "jlpm" + ], + "before-build-python": [ + "jlpm clean:all" + ] + } + } +} diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js b/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js new file mode 100644 index 00000000..b70adee6 --- /dev/null +++ b/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkjupyterlab_active_log=self.webpackChunkjupyterlab_active_log||[]).push([[568],{568:(t,e,a)=>{a.r(e),a.d(e,{default:()=>i});const i={id:"jupyterlab_active_log:plugin",autoStart:!0,activate:t=>{console.log("JupyterLab extension jupyterlab_active_log is 
activated!"),window.consts=Object.assign(Object.assign({},window.consts),{recordUrl:"https://modelscope.cn/api/v1/notebooks/activelog",timerDuration:1e4,timerParams:function(){const t=location.pathname.split("/");let e;return t.length>=2&&(e=t[1]),{site:"dsw",id:e,ext:{pathname:location.pathname}}}});const e=document.body,a=e.insertBefore(document.createElement("script"),e.firstChild);a.setAttribute("id","timer-sdk"),a.setAttribute("src","https://g.alicdn.com/alifanyi/translate-js-sdk/timer.js ")}}}}]); diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js b/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js new file mode 100644 index 00000000..2129fc3d --- /dev/null +++ b/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkjupyterlab_active_log=self.webpackChunkjupyterlab_active_log||[]).push([[747],{150:(e,n,t)=>{t.d(n,{Z:()=>a});var r=t(645),o=t.n(r)()((function(e){return e[1]}));o.push([e.id,"/*\n See the JupyterLab Developer Guide for useful CSS Patterns:\n\n https://jupyterlab.readthedocs.io/en/stable/developer/css.html\n*/\n",""]);const a=o},645:e=>{e.exports=function(e){var n=[];return n.toString=function(){return this.map((function(n){var t=e(n);return n[2]?"@media ".concat(n[2]," {").concat(t,"}"):t})).join("")},n.i=function(e,t,r){"string"==typeof e&&(e=[[null,e,""]]);var o={};if(r)for(var a=0;a{var r,o=function(){var e={};return function(n){if(void 0===e[n]){var t=document.querySelector(n);if(window.HTMLIFrameElement&&t instanceof window.HTMLIFrameElement)try{t=t.contentDocument.head}catch(e){t=null}e[n]=t}return e[n]}}(),a=[];function i(e){for(var n=-1,t=0;t{t.r(n);var r=t(379),o=t.n(r),a=t(150);o()(a.Z,{insert:"head",singleton:!1}),a.Z.locals}}]); diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js b/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js new file mode 100644 index 00000000..ec49e973 --- /dev/null +++ b/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js @@ -0,0 +1 @@ +var _JUPYTERLAB;(()=>{"use strict";var e,r,t={293:(e,r,t)=>{var o={"./index":()=>t.e(568).then((()=>()=>t(568))),"./extension":()=>t.e(568).then((()=>()=>t(568))),"./style":()=>t.e(747).then((()=>()=>t(747)))},a=(e,r)=>(t.R=r,r=t.o(o,e)?o[e]():Promise.resolve().then((()=>{throw new Error('Module "'+e+'" does not exist in container.')})),t.R=void 0,r),n=(e,r)=>{if(t.S){var o="default",a=t.S[o];if(a&&a!==e)throw new Error("Container initialization failed as it has already been initialized with a different share scope");return t.S[o]=e,t.I(o,r)}};t.d(r,{get:()=>a,init:()=>n})}},o={};function a(e){var r=o[e];if(void 0!==r)return r.exports;var n=o[e]={id:e,exports:{}};return t[e](n,n.exports,a),n.exports}a.m=t,a.c=o,a.n=e=>{var r=e&&e.__esModule?()=>e.default:()=>e;return a.d(r,{a:r}),r},a.d=(e,r)=>{for(var t in r)a.o(r,t)&&!a.o(e,t)&&Object.defineProperty(e,t,{enumerable:!0,get:r[t]})},a.f={},a.e=e=>Promise.all(Object.keys(a.f).reduce(((r,t)=>(a.f[t](e,r),r)),[])),a.u=e=>e+"."+{568:"a92ae44b87625ab09aed",747:"63b4c3d22bfe458b352b"}[e]+".js?v="+{568:"a92ae44b87625ab09aed",747:"63b4c3d22bfe458b352b"}[e],a.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return 
window}}(),a.o=(e,r)=>Object.prototype.hasOwnProperty.call(e,r),e={},r="jupyterlab_active_log:",a.l=(t,o,n,i)=>{if(e[t])e[t].push(o);else{var l,u;if(void 0!==n)for(var c=document.getElementsByTagName("script"),d=0;d{l.onerror=l.onload=null,clearTimeout(f);var a=e[t];if(delete e[t],l.parentNode&&l.parentNode.removeChild(l),a&&a.forEach((e=>e(o))),r)return r(o)},f=setTimeout(p.bind(null,void 0,{type:"timeout",target:l}),12e4);l.onerror=p.bind(null,l.onerror),l.onload=p.bind(null,l.onload),u&&document.head.appendChild(l)}},a.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},(()=>{a.S={};var e={},r={};a.I=(t,o)=>{o||(o=[]);var n=r[t];if(n||(n=r[t]={}),!(o.indexOf(n)>=0)){if(o.push(n),e[t])return e[t];a.o(a.S,t)||(a.S[t]={});var i=a.S[t],l="jupyterlab_active_log",u=[];return"default"===t&&((e,r,t,o)=>{var n=i[e]=i[e]||{},u=n[r];(!u||!u.loaded&&(1!=!u.eager?o:l>u.from))&&(n[r]={get:()=>a.e(568).then((()=>()=>a(568))),from:l,eager:!1})})("jupyterlab_active_log","0.1.0"),e[t]=u.length?Promise.all(u).then((()=>e[t]=1)):1}}})(),(()=>{var e;a.g.importScripts&&(e=a.g.location+"");var r=a.g.document;if(!e&&r&&(r.currentScript&&(e=r.currentScript.src),!e)){var t=r.getElementsByTagName("script");t.length&&(e=t[t.length-1].src)}if(!e)throw new Error("Automatic publicPath is not supported in this browser");e=e.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^\/]+$/,"/"),a.p=e})(),(()=>{var e={346:0};a.f.j=(r,t)=>{var o=a.o(e,r)?e[r]:void 0;if(0!==o)if(o)t.push(o[2]);else{var n=new Promise(((t,a)=>o=e[r]=[t,a]));t.push(o[2]=n);var i=a.p+a.u(r),l=new Error;a.l(i,(t=>{if(a.o(e,r)&&(0!==(o=e[r])&&(e[r]=void 0),o)){var n=t&&("load"===t.type?"missing":t.type),i=t&&t.target&&t.target.src;l.message="Loading chunk "+r+" failed.\n("+n+": "+i+")",l.name="ChunkLoadError",l.type=n,l.request=i,o[1](l)}}),"chunk-"+r,r)}};var r=(r,t)=>{var o,n,[i,l,u]=t,c=0;if(i.some((r=>0!==e[r]))){for(o in l)a.o(l,o)&&(a.m[o]=l[o]);u&&u(a)}for(r&&r(t);c/dev/null 2>&1 || { echo 'git not installed' ; exit 0; } + +if [ -z "$MODELSCOPE_USERNAME" ] || [ -z "$MODELSCOPE_GITLAB_ACCESS_TOKEN" ]; then + : +else + git config --global credential.helper store + echo "http://${MODELSCOPE_USERNAME}:${MODELSCOPE_GITLAB_ACCESS_TOKEN}@www.modelscope.cn">~/.git-credentials + echo "https://${MODELSCOPE_USERNAME}:${MODELSCOPE_GITLAB_ACCESS_TOKEN}@www.modelscope.cn">>~/.git-credentials + chmod go-rwx ~/.git-credentials +fi +if [ -z "$MODELSCOPE_USERNAME" ] || [ -z "$MODELSCOPE_USEREMAIL" ]; then + : +else + git config --system user.name ${MODELSCOPE_USERNAME} + git config --system user.email ${MODELSCOPE_USEREMAIL} +fi +if [ -z "$MODELSCOPE_ENVIRONMENT" ]; then + : +else + git config --system --add http.http://www.modelscope.cn.extraHeader "Modelscope_Environment: $MODELSCOPE_ENVIRONMENT" + git config --system --add http.https://www.modelscope.cn.extraHeader "Modelscope_Environment: $MODELSCOPE_ENVIRONMENT" +fi + +if [ -z "$MODELSCOPE_USERNAME" ]; then + : +else + git config --system --add http.http://www.modelscope.cn.extraHeader "Modelscope_User: $MODELSCOPE_USERNAME" + git config --system --add http.https://www.modelscope.cn.extraHeader "Modelscope_User: $MODELSCOPE_USERNAME" +fi + +if [ -z "$MODELSCOPE_USERID" ]; then + : +else + git config --system --add http.http://www.modelscope.cn.extraHeader "Modelscope_Userid: $MODELSCOPE_USERID" + git config --system --add http.https://www.modelscope.cn.extraHeader "Modelscope_Userid: 
$MODELSCOPE_USERID" +fi + +if [ -z "$MODELSCOPE_HAVANAID" ]; then + : +else + git config --system --add http.http://www.modelscope.cn.extraHeader "Modelscope_Havanaid: $MODELSCOPE_HAVANAID" + git config --system --add http.https://www.modelscope.cn.extraHeader "Modelscope_Havanaid: $MODELSCOPE_HAVANAID" +fi diff --git a/docs/source/conf.py b/docs/source/conf.py index 39e0d881..4371c927 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,7 +25,7 @@ version_file = '../../modelscope/version.py' def get_version(): - with open(version_file, 'r') as f: + with open(version_file, 'r', encoding='utf-8') as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index f2ff822d..17c21d44 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -739,7 +739,7 @@ class ModelScopeConfig: with open( os.path.join(ModelScopeConfig.path_credential, ModelScopeConfig.USER_INFO_FILE_NAME), - 'r') as f: + 'r', encoding='utf-8') as f: info = f.read() return info.split(':')[0], info.split(':')[1] except FileNotFoundError: @@ -760,7 +760,7 @@ class ModelScopeConfig: with open( os.path.join(ModelScopeConfig.path_credential, ModelScopeConfig.GIT_TOKEN_FILE_NAME), - 'r') as f: + 'r', encoding='utf-8') as f: token = f.read() except FileNotFoundError: pass diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index c7c3e729..ccd36349 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): image_reid_person = 'passvitb' image_inpainting = 'FFTInpainting' video_summarization = 'pgl-video-summarization' + language_guided_video_summarization = 'clip-it-language-guided-video-summarization' swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' @@ -200,6 +201,7 @@ class Pipelines(object): video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' image_panoptic_segmentation = 'image-panoptic-segmentation' video_summarization = 'googlenet_pgl_video_summarization' + language_guided_video_summarization = 'clip-it-video-summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_reid_person = 'passvitb-image-reid-person' image_inpainting = 'fft-inpainting' diff --git a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py index cc47d0c4..9378c32a 100644 --- a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py +++ b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py @@ -21,7 +21,7 @@ class KanTtsText2MelDataset(Dataset): self.cache = cache - with open(config_filename) as f: + with open(config_filename, encoding='utf-8') as f: self._config = json.loads(f.read()) # Load metadata: diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py index a9b55795..9a14219e 100644 --- a/modelscope/models/audio/tts/sambert_hifi.py +++ b/modelscope/models/audio/tts/sambert_hifi.py @@ -60,7 +60,7 @@ class SambertHifigan(Model): with zipfile.ZipFile(zip_file, 'r') as zip_ref: zip_ref.extractall(model_dir) voice_cfg_path = os.path.join(self.__voice_path, 'voices.json') - with open(voice_cfg_path, 'r') as f: + with open(voice_cfg_path, 'r', encoding='utf-8') as f: voice_cfg = json.load(f) if 'voices' not in voice_cfg: raise TtsModelConfigurationException( diff --git 
a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 64039863..de972032 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -10,10 +10,10 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, image_panoptic_segmentation, image_portrait_enhancement, image_reid_person, image_semantic_segmentation, image_to_image_generation, image_to_image_translation, - movie_scene_segmentation, object_detection, - product_retrieval_embedding, realtime_object_detection, - referring_video_object_segmentation, salient_detection, - shop_segmentation, super_resolution, + language_guided_video_summarization, movie_scene_segmentation, + object_detection, product_retrieval_embedding, + realtime_object_detection, referring_video_object_segmentation, + salient_detection, shop_segmentation, super_resolution, video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/language_guided_video_summarization/__init__.py b/modelscope/models/cv/language_guided_video_summarization/__init__.py new file mode 100755 index 00000000..73f7bd03 --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .summarizer import ( + ClipItVideoSummarization, ) + +else: + _import_structure = { + 'summarizer': [ + 'ClipItVideoSummarization', + ] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/language_guided_video_summarization/summarizer.py b/modelscope/models/cv/language_guided_video_summarization/summarizer.py new file mode 100755 index 00000000..654dc3ea --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/summarizer.py @@ -0,0 +1,194 @@ +# Part of the implementation is borrowed and modified from BMT and video_features, +# publicly available at https://github.com/v-iashin/BMT +# and https://github.com/v-iashin/video_features + +import argparse +import os +import os.path as osp +from copy import deepcopy +from typing import Dict, Union + +import numpy as np +import torch +import torch.nn as nn +from bmt_clipit.sample.single_video_prediction import (caption_proposals, + generate_proposals, + load_cap_model, + load_prop_model) +from bmt_clipit.utilities.proposal_utils import non_max_suppresion +from torch.nn.parallel import DataParallel, DistributedDataParallel +from videofeatures_clipit.models.i3d.extract_i3d import ExtractI3D +from videofeatures_clipit.models.vggish.extract_vggish import ExtractVGGish +from videofeatures_clipit.utils.utils import (fix_tensorflow_gpu_allocation, + form_list_from_user_input) + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.language_guided_video_summarization.transformer import \ + Transformer +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def extract_text(args): + # Loading models and other essential stuff + cap_cfg, cap_model, train_dataset = load_cap_model( + args.pretrained_cap_model_path, args.device_id) + prop_cfg, prop_model = load_prop_model(args.device_id, + 
args.prop_generator_model_path, + args.pretrained_cap_model_path, + args.max_prop_per_vid) + # Proposal + proposals = generate_proposals(prop_model, args.features, + train_dataset.pad_idx, prop_cfg, + args.device_id, args.duration_in_secs) + # NMS if specified + if args.nms_tiou_thresh is not None: + proposals = non_max_suppresion(proposals.squeeze(), + args.nms_tiou_thresh) + proposals = proposals.unsqueeze(0) + # Captions for each proposal + captions = caption_proposals(cap_model, args.features, train_dataset, + cap_cfg, args.device_id, proposals, + args.duration_in_secs) + return captions + + +def extract_video_features(video_path, tmp_path, feature_type, i3d_flow_path, + i3d_rgb_path, kinetics_class_labels, pwc_path, + vggish_model_path, vggish_pca_path, extraction_fps, + device): + default_args = dict( + device=device, + extraction_fps=extraction_fps, + feature_type=feature_type, + file_with_video_paths=None, + i3d_flow_path=i3d_flow_path, + i3d_rgb_path=i3d_rgb_path, + keep_frames=False, + kinetics_class_labels=kinetics_class_labels, + min_side_size=256, + pwc_path=pwc_path, + show_kinetics_pred=False, + stack_size=64, + step_size=64, + tmp_path=tmp_path, + vggish_model_path=vggish_model_path, + vggish_pca_path=vggish_pca_path, + ) + args = argparse.Namespace(**default_args) + + if args.feature_type == 'i3d': + extractor = ExtractI3D(args) + elif args.feature_type == 'vggish': + extractor = ExtractVGGish(args) + + feats = extractor(video_path) + return feats + + +def video_features_to_txt(duration_in_secs, pretrained_cap_model_path, + prop_generator_model_path, features, device_id): + default_args = dict( + device_id=device_id, + duration_in_secs=duration_in_secs, + features=features, + pretrained_cap_model_path=pretrained_cap_model_path, + prop_generator_model_path=prop_generator_model_path, + max_prop_per_vid=100, + nms_tiou_thresh=0.4, + ) + args = argparse.Namespace(**default_args) + txt = extract_text(args) + return txt + + +@MODELS.register_module( + Tasks.language_guided_video_summarization, + module_name=Models.language_guided_video_summarization) +class ClipItVideoSummarization(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the video summarization model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + + self.loss = nn.MSELoss() + self.model = Transformer() + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + self.model = self.model.to(self._device) + + self.model = self.load_pretrained(self.model, model_path) + + if self.training: + self.model.train() + else: + self.model.eval() + + def load_pretrained(self, net, load_path, strict=True, param_key='params'): + if isinstance(net, (DataParallel, DistributedDataParallel)): + net = net.module + load_net = torch.load( + load_path, map_location=lambda storage, loc: storage) + if param_key is not None: + if param_key not in load_net and 'params' in load_net: + param_key = 'params' + logger.info( + f'Loading: {param_key} does not exist, use params.') + if param_key in load_net: + load_net = load_net[param_key] + logger.info( + f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' + ) + # remove unnecessary 'module.' 
+ for k, v in deepcopy(load_net).items(): + if k.startswith('module.'): + load_net[k[7:]] = v + load_net.pop(k) + net.load_state_dict(load_net, strict=strict) + logger.info('load model done.') + return net + + def _train_forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + frame_features = input['frame_features'] + txt_features = input['txt_features'] + gtscore = input['gtscore'] + preds, attn_weights = self.model(frame_features, txt_features, + frame_features) + return {'loss': self.loss(preds, gtscore)} + + def _inference_forward(self, input: Dict[str, + Tensor]) -> Dict[str, Tensor]: + frame_features = input['frame_features'] + txt_features = input['txt_features'] + y, dec_output = self.model(frame_features, txt_features, + frame_features) + return {'scores': y} + + def forward(self, input: Dict[str, + Tensor]) -> Dict[str, Union[list, Tensor]]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Union[list, Tensor]]: results + """ + for key, value in input.items(): + input[key] = input[key].to(self._device) + if self.training: + return self._train_forward(input) + else: + return self._inference_forward(input) diff --git a/modelscope/models/cv/language_guided_video_summarization/transformer/__init__.py b/modelscope/models/cv/language_guided_video_summarization/transformer/__init__.py new file mode 100755 index 00000000..68dccccf --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/transformer/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .models import ( + Transformer, ) + +else: + _import_structure = { + 'models': [ + 'Transformer', + ] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/language_guided_video_summarization/transformer/layers.py b/modelscope/models/cv/language_guided_video_summarization/transformer/layers.py new file mode 100755 index 00000000..6782c209 --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/transformer/layers.py @@ -0,0 +1,48 @@ +# Part of the implementation is borrowed and modified from attention-is-all-you-need-pytorch, +# publicly available at https://github.com/jadore801120/attention-is-all-you-need-pytorch +import torch +import torch.nn as nn + +from .sub_layers import MultiHeadAttention, PositionwiseFeedForward + + +class EncoderLayer(nn.Module): + """Compose with two layers""" + + def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1): + super(EncoderLayer, self).__init__() + self.slf_attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, dropout=dropout) + self.pos_ffn = PositionwiseFeedForward( + d_model, d_inner, dropout=dropout) + + def forward(self, enc_input, slf_attn_mask=None): + enc_output, enc_slf_attn = self.slf_attn( + enc_input, enc_input, enc_input, mask=slf_attn_mask) + enc_output = self.pos_ffn(enc_output) + return enc_output, enc_slf_attn + + +class DecoderLayer(nn.Module): + """Compose with three layers""" + + def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1): + super(DecoderLayer, self).__init__() + self.slf_attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, dropout=dropout) + self.enc_attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, dropout=dropout) + 
self.pos_ffn = PositionwiseFeedForward( + d_model, d_inner, dropout=dropout) + + def forward(self, + dec_input, + enc_output, + slf_attn_mask=None, + dec_enc_attn_mask=None): + dec_output, dec_slf_attn = self.slf_attn( + dec_input, dec_input, dec_input, mask=slf_attn_mask) + dec_output, dec_enc_attn = self.enc_attn( + dec_output, enc_output, enc_output, mask=dec_enc_attn_mask) + dec_output = self.pos_ffn(dec_output) + return dec_output, dec_slf_attn, dec_enc_attn diff --git a/modelscope/models/cv/language_guided_video_summarization/transformer/models.py b/modelscope/models/cv/language_guided_video_summarization/transformer/models.py new file mode 100755 index 00000000..f4ae34ee --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/transformer/models.py @@ -0,0 +1,229 @@ +# Part of the implementation is borrowed and modified from attention-is-all-you-need-pytorch, +# publicly available at https://github.com/jadore801120/attention-is-all-you-need-pytorch + +import numpy as np +import torch +import torch.nn as nn + +from .layers import DecoderLayer, EncoderLayer +from .sub_layers import MultiHeadAttention + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_hid, n_position=200): + super(PositionalEncoding, self).__init__() + + # Not a parameter + self.register_buffer( + 'pos_table', self._get_sinusoid_encoding_table(n_position, d_hid)) + + def _get_sinusoid_encoding_table(self, n_position, d_hid): + """Sinusoid position encoding table""" + + # TODO: make it with torch instead of numpy + + def get_position_angle_vec(position): + return [ + position / np.power(10000, 2 * (hid_j // 2) / d_hid) + for hid_j in range(d_hid) + ] + + sinusoid_table = np.array( + [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) + + def forward(self, x): + return x + self.pos_table[:, :x.size(1)].clone().detach() + + +class Encoder(nn.Module): + """A encoder model with self attention mechanism.""" + + def __init__(self, + d_word_vec=1024, + n_layers=6, + n_head=8, + d_k=64, + d_v=64, + d_model=512, + d_inner=2048, + dropout=0.1, + n_position=200): + + super().__init__() + + self.position_enc = PositionalEncoding( + d_word_vec, n_position=n_position) + self.dropout = nn.Dropout(p=dropout) + self.layer_stack = nn.ModuleList([ + EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout) + for _ in range(n_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.d_model = d_model + + def forward(self, enc_output, return_attns=False): + + enc_slf_attn_list = [] + # -- Forward + enc_output = self.dropout(self.position_enc(enc_output)) + enc_output = self.layer_norm(enc_output) + + for enc_layer in self.layer_stack: + enc_output, enc_slf_attn = enc_layer(enc_output) + enc_slf_attn_list += [enc_slf_attn] if return_attns else [] + + if return_attns: + return enc_output, enc_slf_attn_list + return enc_output, + + +class Decoder(nn.Module): + """A decoder model with self attention mechanism.""" + + def __init__(self, + d_word_vec=1024, + n_layers=6, + n_head=8, + d_k=64, + d_v=64, + d_model=512, + d_inner=2048, + n_position=200, + dropout=0.1): + + super().__init__() + + self.position_enc = PositionalEncoding( + d_word_vec, n_position=n_position) + self.dropout = nn.Dropout(p=dropout) + self.layer_stack = nn.ModuleList([ + DecoderLayer(d_model, d_inner, n_head, 
d_k, d_v, dropout=dropout) + for _ in range(n_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.d_model = d_model + + def forward(self, + dec_output, + enc_output, + src_mask=None, + trg_mask=None, + return_attns=False): + + dec_slf_attn_list, dec_enc_attn_list = [], [] + + # -- Forward + dec_output = self.dropout(self.position_enc(dec_output)) + dec_output = self.layer_norm(dec_output) + + for dec_layer in self.layer_stack: + dec_output, dec_slf_attn, dec_enc_attn = dec_layer( + dec_output, + enc_output, + slf_attn_mask=trg_mask, + dec_enc_attn_mask=src_mask) + dec_slf_attn_list += [dec_slf_attn] if return_attns else [] + dec_enc_attn_list += [dec_enc_attn] if return_attns else [] + + if return_attns: + return dec_output, dec_slf_attn_list, dec_enc_attn_list + return dec_output, + + +class Transformer(nn.Module): + """A sequence to sequence model with attention mechanism.""" + + def __init__(self, + num_sentence=7, + txt_atten_head=4, + d_frame_vec=512, + d_model=512, + d_inner=2048, + n_layers=6, + n_head=8, + d_k=256, + d_v=256, + dropout=0.1, + n_position=4000): + + super().__init__() + + self.d_model = d_model + + self.layer_norm_img_src = nn.LayerNorm(d_frame_vec, eps=1e-6) + self.layer_norm_img_trg = nn.LayerNorm(d_frame_vec, eps=1e-6) + self.layer_norm_txt = nn.LayerNorm( + num_sentence * d_frame_vec, eps=1e-6) + + self.linear_txt = nn.Linear( + in_features=num_sentence * d_frame_vec, out_features=d_model) + self.lg_attention = MultiHeadAttention( + n_head=txt_atten_head, d_model=d_model, d_k=d_k, d_v=d_v) + + self.encoder = Encoder( + n_position=n_position, + d_word_vec=d_frame_vec, + d_model=d_model, + d_inner=d_inner, + n_layers=n_layers, + n_head=n_head, + d_k=d_k, + d_v=d_v, + dropout=dropout) + + self.decoder = Decoder( + n_position=n_position, + d_word_vec=d_frame_vec, + d_model=d_model, + d_inner=d_inner, + n_layers=n_layers, + n_head=n_head, + d_k=d_k, + d_v=d_v, + dropout=dropout) + + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + assert d_model == d_frame_vec, 'the dimensions of all module outputs shall be the same.' 
+ + self.linear_1 = nn.Linear(in_features=d_model, out_features=d_model) + self.linear_2 = nn.Linear( + in_features=self.linear_1.out_features, out_features=1) + + self.drop = nn.Dropout(p=0.5) + self.norm_y = nn.LayerNorm(normalized_shape=d_model, eps=1e-6) + self.norm_linear = nn.LayerNorm( + normalized_shape=self.linear_1.out_features, eps=1e-6) + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, src_seq, src_txt, trg_seq): + + features_txt = self.linear_txt(src_txt) + atten_seq, txt_attn = self.lg_attention(src_seq, features_txt, + features_txt) + + enc_output, *_ = self.encoder(atten_seq) + dec_output, *_ = self.decoder(trg_seq, enc_output) + + y = self.drop(enc_output) + y = self.norm_y(y) + + # 2-layer NN (Regressor Network) + y = self.linear_1(y) + y = self.relu(y) + y = self.drop(y) + y = self.norm_linear(y) + + y = self.linear_2(y) + y = self.sigmoid(y) + y = y.view(1, -1) + + return y, dec_output diff --git a/modelscope/models/cv/language_guided_video_summarization/transformer/modules.py b/modelscope/models/cv/language_guided_video_summarization/transformer/modules.py new file mode 100755 index 00000000..03ef8eaf --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/transformer/modules.py @@ -0,0 +1,27 @@ +# Part of the implementation is borrowed and modified from attention-is-all-you-need-pytorch, +# publicly available at https://github.com/jadore801120/attention-is-all-you-need-pytorch + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ScaledDotProductAttention(nn.Module): + """Scaled Dot-Product Attention""" + + def __init__(self, temperature, attn_dropout=0.1): + super().__init__() + self.temperature = temperature + self.dropout = nn.Dropout(attn_dropout) + + def forward(self, q, k, v, mask=None): + + attn = torch.matmul(q / self.temperature, k.transpose(2, 3)) + + if mask is not None: + attn = attn.masked_fill(mask == 0, -1e9) + + attn = self.dropout(F.softmax(attn, dim=-1)) + output = torch.matmul(attn, v) + + return output, attn diff --git a/modelscope/models/cv/language_guided_video_summarization/transformer/sub_layers.py b/modelscope/models/cv/language_guided_video_summarization/transformer/sub_layers.py new file mode 100755 index 00000000..42e10abb --- /dev/null +++ b/modelscope/models/cv/language_guided_video_summarization/transformer/sub_layers.py @@ -0,0 +1,83 @@ +# Part of the implementation is borrowed and modified from attention-is-all-you-need-pytorch, +# publicly available at https://github.com/jadore801120/attention-is-all-you-need-pytorch + +import numpy as np +import torch.nn as nn +import torch.nn.functional as F + +from .modules import ScaledDotProductAttention + + +class MultiHeadAttention(nn.Module): + """Multi-Head Attention module""" + + def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): + super().__init__() + + self.n_head = n_head + self.d_k = d_k + self.d_v = d_v + + self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False) + self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False) + self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False) + self.fc = nn.Linear(n_head * d_v, d_model, bias=False) + + self.attention = ScaledDotProductAttention(temperature=d_k**0.5) + + self.dropout = nn.Dropout(dropout) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + + def forward(self, q, k, v, mask=None): + + d_k, d_v, n_head = self.d_k, self.d_v, self.n_head + sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1) + + residual = q + + # Pass through the 
pre-attention projection: b x lq x (n*dv) + # Separate different heads: b x lq x n x dv + q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) + k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) + v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) + + # Transpose for attention dot product: b x n x lq x dv + q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) + + if mask is not None: + mask = mask.unsqueeze(1) # For head axis broadcasting. + + q, attn = self.attention(q, k, v, mask=mask) + + # Transpose to move the head dimension back: b x lq x n x dv + # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv) + q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1) + q = self.dropout(self.fc(q)) + q += residual + + q = self.layer_norm(q) + + return q, attn + + +class PositionwiseFeedForward(nn.Module): + """A two-feed-forward-layer module""" + + def __init__(self, d_in, d_hid, dropout=0.1): + super().__init__() + self.w_1 = nn.Linear(d_in, d_hid) # position-wise + self.w_2 = nn.Linear(d_hid, d_in) # position-wise + self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + + residual = x + + x = self.w_2(F.relu(self.w_1(x))) + x = self.dropout(x) + x += residual + + x = self.layer_norm(x) + + return x diff --git a/modelscope/models/cv/tinynas_classfication/plain_net_utils.py b/modelscope/models/cv/tinynas_classfication/plain_net_utils.py index 844535ed..1f5c8852 100644 --- a/modelscope/models/cv/tinynas_classfication/plain_net_utils.py +++ b/modelscope/models/cv/tinynas_classfication/plain_net_utils.py @@ -39,7 +39,7 @@ class PlainNet(nn.Module): plainnet_struct_txt = self.module_opt.plainnet_struct_txt if plainnet_struct_txt is not None: - with open(plainnet_struct_txt, 'r') as fid: + with open(plainnet_struct_txt, 'r', encoding='utf-8') as fid: the_line = fid.readlines()[0].strip() self.plainnet_struct = the_line pass diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py index 8d356f42..1ee715c9 100644 --- a/modelscope/models/multi_modal/clip/bert_tokenizer.py +++ b/modelscope/models/multi_modal/clip/bert_tokenizer.py @@ -120,7 +120,7 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 - with open(vocab_file, 'r') as reader: + with open(vocab_file, 'r', encoding='utf-8') as reader: while True: token = convert_to_unicode(reader.readline()) if not token: diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index 9b82e4a1..c2d82dca 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -523,8 +523,10 @@ class CLIPForMultiModalEmbedding(TorchModel): logger.info(f'Loading text model config from {text_model_config_file}') assert os.path.exists(text_model_config_file) - with open(vision_model_config_file, - 'r') as fv, open(text_model_config_file, 'r') as ft: + with open( + vision_model_config_file, 'r', + encoding='utf-8') as fv,\ + open(text_model_config_file, 'r', encoding='utf-8') as ft: self.model_info = json.load(fv) for k, v in json.load(ft).items(): self.model_info[k] = v diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 4229391f..5150a0c3 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -76,7 +76,7 @@ class 
DiffusionModel(nn.Module): super(DiffusionModel, self).__init__() # including text and generator config model_config = json.load( - open('{}/model_config.json'.format(model_dir))) + open('{}/model_config.json'.format(model_dir), encoding='utf-8')) # text encoder text_config = model_config['text_config'] @@ -142,7 +142,9 @@ class DiffusionForTextToImageSynthesis(Model): # diffusion process diffusion_params = json.load( - open('{}/diffusion_config.json'.format(model_dir))) + open( + '{}/diffusion_config.json'.format(model_dir), + encoding='utf-8')) self.diffusion_generator = make_diffusion( **diffusion_params['generator_config']) self.diffusion_upsampler_256 = make_diffusion( diff --git a/modelscope/models/multi_modal/diffusion/structbert.py b/modelscope/models/multi_modal/diffusion/structbert.py index d5d678ed..16c1407f 100644 --- a/modelscope/models/multi_modal/diffusion/structbert.py +++ b/modelscope/models/multi_modal/diffusion/structbert.py @@ -130,7 +130,7 @@ class BertConfig(object): @classmethod def from_json_file(cls, json_file): """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, 'r') as reader: + with open(json_file, 'r', encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) diff --git a/modelscope/models/multi_modal/diffusion/tokenizer.py b/modelscope/models/multi_modal/diffusion/tokenizer.py index 82c09661..e2c951b1 100644 --- a/modelscope/models/multi_modal/diffusion/tokenizer.py +++ b/modelscope/models/multi_modal/diffusion/tokenizer.py @@ -67,7 +67,7 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 - with open(vocab_file, 'r') as reader: + with open(vocab_file, 'r', encoding='utf-8') as reader: while True: token = convert_to_unicode(reader.readline()) if not token: diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index 806c469c..c77a682a 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -522,7 +522,9 @@ class GEMMModel(nn.Module): def __init__(self, model_dir): super().__init__() - with open('{}/encoder_config.json'.format(model_dir), 'r') as f: + with open( + '{}/encoder_config.json'.format(model_dir), 'r', + encoding='utf-8') as f: model_config = json.loads(f.read()) model_name = list(model_config.keys())[0] config_args = model_config[model_name] diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 0cc040c6..813f750e 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -35,7 +35,9 @@ class VideoCLIPForMultiModalEmbedding(TorchModel): def __init__(self, model_dir, **kwargs): super().__init__(model_dir=model_dir, **kwargs) # model config parameters - with open(f'{model_dir}/{ModelFile.CONFIGURATION}', 'r') as json_file: + with open( + f'{model_dir}/{ModelFile.CONFIGURATION}', 'r', + encoding='utf-8') as json_file: model_config = json.load(json_file) model_config = model_config['paras'] model_config['model_dir'] = model_dir diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py index 914678c5..946ebb82 100644 --- a/modelscope/models/multi_modal/mplug/configuration_mplug.py +++ 
b/modelscope/models/multi_modal/mplug/configuration_mplug.py @@ -111,6 +111,6 @@ class MPlugConfig(PretrainedConfig): @classmethod def from_yaml_file(cls, yaml_file: Union[str, os.PathLike]) -> Dict[str, Any]: - with open(yaml_file, 'r') as reader: + with open(yaml_file, 'r', encoding='utf-8') as reader: config_dict = yaml.load(reader, Loader=yaml.Loader) return cls(**config_dict) diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py index 59bd837d..58fd6698 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/model.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py @@ -50,7 +50,8 @@ class UnCLIP(nn.Module): def __init__(self, model_dir): super(UnCLIP, self).__init__() self.model_dir = model_dir - self.config = json.load(open(f'{model_dir}/{ModelFile.CONFIGURATION}')) + self.config = json.load( + open(f'{model_dir}/{ModelFile.CONFIGURATION}', encoding='utf-8')) # modules self.clip = CLIP(**self.config['clip']).fp16() diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index fc578b25..77dff54a 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -312,7 +312,7 @@ class OfaForAllTasks(TorchModel): if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(self.model_dir, self.cfg.model.answer2label) - with open(ans2label_file, 'r') as reader: + with open(ans2label_file, 'r', encoding='utf-8') as reader: self.ans2label_dict = json.load(reader) def save_pretrained(self, diff --git a/modelscope/models/nlp/mglm/arguments.py b/modelscope/models/nlp/mglm/arguments.py index 13b3aeab..4fa33c65 100755 --- a/modelscope/models/nlp/mglm/arguments.py +++ b/modelscope/models/nlp/mglm/arguments.py @@ -743,7 +743,7 @@ def get_args(): if hasattr(args, 'deepspeed' ) and args.deepspeed and args.deepspeed_config is not None: - with open(args.deepspeed_config) as file: + with open(args.deepspeed_config, encoding='utf-8') as file: deepspeed_config = json.load(file) if 'train_micro_batch_size_per_gpu' in deepspeed_config: args.batch_size = deepspeed_config[ diff --git a/modelscope/models/nlp/mglm/data_utils/corpora.py b/modelscope/models/nlp/mglm/data_utils/corpora.py index 7c6f58f8..cf756c0a 100755 --- a/modelscope/models/nlp/mglm/data_utils/corpora.py +++ b/modelscope/models/nlp/mglm/data_utils/corpora.py @@ -156,7 +156,7 @@ class DataReader: def read_input_to_queue(): for path in paths: print_rank_0(f'Start reading {path}') - with open(path) as file: + with open(path, encoding='utf-8') as file: items = json.load(file) for item in items: task_queue.put(item) diff --git a/modelscope/models/nlp/mglm/data_utils/datasets.py b/modelscope/models/nlp/mglm/data_utils/datasets.py index 777b7d43..39ffaea3 100644 --- a/modelscope/models/nlp/mglm/data_utils/datasets.py +++ b/modelscope/models/nlp/mglm/data_utils/datasets.py @@ -511,12 +511,12 @@ class json_dataset(data.Dataset): def load_json_stream(self, load_path): if not self.loose_json: - jsons = json.load(open(load_path, 'r')) + jsons = json.load(open(load_path, 'r', encoding='utf-8')) generator = iter(jsons) else: def gen_helper(): - with open(load_path, 'r') as f: + with open(load_path, 'r', encoding='utf-8') as f: for row in f: yield json.loads(row) diff --git a/modelscope/models/nlp/mglm/data_utils/extraction.py b/modelscope/models/nlp/mglm/data_utils/extraction.py index 53027e4f..da062f34 100644 --- 
a/modelscope/models/nlp/mglm/data_utils/extraction.py +++ b/modelscope/models/nlp/mglm/data_utils/extraction.py @@ -29,7 +29,9 @@ with open(output_path, 'w') as output: print(filename) article_lines = [] article_open = False - with open(filename, mode='r', newline='\n') as file: + with open( + filename, mode='r', newline='\n', + encoding='utf-8') as file: for line in file: line = line.rstrip() if ' List[InputExample]: examples = [] - with open(path) as f: + with open(path, encoding='utf-8') as f: reader = csv.reader(f, delimiter=',') for idx, row in enumerate(reader): label, headline, body = row @@ -1209,7 +1209,7 @@ class YelpPolarityProcessor(DataProcessor): def _create_examples(path: str, set_type: str) -> List[InputExample]: examples = [] - with open(path) as f: + with open(path, encoding='utf-8') as f: reader = csv.reader(f, delimiter=',') for idx, row in enumerate(reader): label, body = row @@ -1419,7 +1419,7 @@ class SquadProcessor(DataProcessor): @staticmethod def _create_examples(path: str, set_type: str) -> List[InputExample]: examples = [] - with open(path) as f: + with open(path, encoding='utf-8') as f: data = json.load(f)['data'] for idx, passage in enumerate(data): diff --git a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py index ff394172..e149f503 100644 --- a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py +++ b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py @@ -538,7 +538,7 @@ class PVP(ABC): dict) # type: Dict[int, Dict[str, List[str]]] current_pattern_id = None - with open(path, 'r') as fh: + with open(path, 'r', encoding='utf-8') as fh: for line in fh.read().splitlines(): if line.isdigit(): current_pattern_id = int(line) diff --git a/modelscope/models/nlp/mglm/utils.py b/modelscope/models/nlp/mglm/utils.py index 2bfcf8c0..0e781189 100644 --- a/modelscope/models/nlp/mglm/utils.py +++ b/modelscope/models/nlp/mglm/utils.py @@ -77,7 +77,7 @@ def print_and_save_args(args, verbose=True, log_dir=None): with open(json_file, 'w') as output: json.dump(vars(args), output, sort_keys=True) if args.deepspeed and args.deepspeed_config is not None: - with open(args.deepspeed_config) as file: + with open(args.deepspeed_config, encoding='utf-8') as file: deepspeed_config = json.load(file) deepspeed_json_file = os.path.join(log_dir, 'config_gpt_large.json') @@ -324,7 +324,7 @@ def get_checkpoint_iteration(load_path): print_rank_0(' will not load any checkpoints and will start from ' 'random') return load_path, 0, False, False - with open(tracker_filename, 'r') as f: + with open(tracker_filename, 'r', encoding='utf-8') as f: metastring = f.read().strip() release = metastring == 'release' # try: diff --git a/modelscope/models/science/unifold/data/residue_constants.py b/modelscope/models/science/unifold/data/residue_constants.py index beebfe89..2701ee38 100644 --- a/modelscope/models/science/unifold/data/residue_constants.py +++ b/modelscope/models/science/unifold/data/residue_constants.py @@ -443,7 +443,7 @@ def load_stereo_chemical_props(): stereo_chemical_props_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'stereo_chemical_props.txt') - with open(stereo_chemical_props_path, 'rt') as f: + with open(stereo_chemical_props_path, 'rt', encoding='utf-8') as f: stereo_chemical_props = f.read() lines_iter = iter(stereo_chemical_props.splitlines()) # Load bond lengths. 
diff --git a/modelscope/models/science/unifold/dataset.py b/modelscope/models/science/unifold/dataset.py index 29e1a8b0..f14c2ef7 100644 --- a/modelscope/models/science/unifold/dataset.py +++ b/modelscope/models/science/unifold/dataset.py @@ -250,7 +250,7 @@ class UnifoldDataset(UnicoreDataset): self.path = data_path def load_json(filename): - return json.load(open(filename, 'r')) + return json.load(open(filename, 'r', encoding='utf-8')) sample_weight = load_json( os.path.join(self.path, @@ -400,7 +400,8 @@ class UnifoldMultimerDataset(UnifoldDataset): self.pdb_assembly = json.load( open( os.path.join(self.data_path, - json_prefix + 'pdb_assembly.json'))) + json_prefix + 'pdb_assembly.json'), + encoding='utf-8')) self.pdb_chains = self.get_chains(self.inverse_multi_label) self.monomer_feature_path = os.path.join(self.data_path, 'pdb_features') diff --git a/modelscope/models/science/unifold/msa/pipeline.py b/modelscope/models/science/unifold/msa/pipeline.py index b7889bff..8037e50e 100644 --- a/modelscope/models/science/unifold/msa/pipeline.py +++ b/modelscope/models/science/unifold/msa/pipeline.py @@ -99,7 +99,7 @@ def run_msa_tool( f.write(result[msa_format]) else: logging.warning('Reading MSA from file %s', msa_out_path) - with open(msa_out_path, 'r') as f: + with open(msa_out_path, 'r', encoding='utf-8') as f: result = {msa_format: f.read()} return result @@ -153,7 +153,7 @@ class DataPipeline: def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: """Runs alignment tools on the input sequence and creates features.""" - with open(input_fasta_path) as f: + with open(input_fasta_path, encoding='utf-8') as f: input_fasta_str = f.read() input_seqs, input_descs = parsers.parse_fasta(input_fasta_str) if len(input_seqs) != 1: diff --git a/modelscope/models/science/unifold/msa/templates.py b/modelscope/models/science/unifold/msa/templates.py index fe3bcef9..d1ff8cf1 100644 --- a/modelscope/models/science/unifold/msa/templates.py +++ b/modelscope/models/science/unifold/msa/templates.py @@ -155,7 +155,7 @@ def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]: """Parses release dates file, returns a mapping from PDBs to release dates.""" if path.endswith('txt'): release_dates = {} - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: for line in f: pdb_id, date = line.split(':') date = date.strip() diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py index 68cbf918..49991b11 100644 --- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py @@ -106,14 +106,14 @@ class MovieSceneSegmentationDataset(TorchTaskDataset): self.tmpl = '{}/shot_{}_img_{}.jpg' # video_id, shot_id, shot_num if not self.test_mode: - with open(self.ann_file) as f: + with open(self.ann_file, encoding='utf-8') as f: self.anno_data = json.load(f) self.vidsid2label = { f"{it['video_id']}_{it['shot_id']}": it['boundary_label'] for it in self.anno_data } else: - with open(self.ann_file) as f: + with open(self.ann_file, encoding='utf-8') as f: self.anno_data = json.load(f) def init_sampler(self, cfg): diff --git a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py 
b/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py index c90351e9..8b6d22a4 100644 --- a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py +++ b/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py @@ -146,7 +146,7 @@ class ReferringVideoObjectSegmentationDataset(TorchTaskDataset): saved_annotations_file_path = osp.join( root_path, f'sentences_single_frame_{subset}_annotations.json') if osp.exists(saved_annotations_file_path): - with open(saved_annotations_file_path, 'r') as f: + with open(saved_annotations_file_path, 'r', encoding='utf-8') as f: text_annotations_by_frame = [tuple(a) for a in json.load(f)] return text_annotations_by_frame elif (distributed and dist.get_rank() == 0) or not distributed: @@ -203,7 +203,7 @@ class ReferringVideoObjectSegmentationDataset(TorchTaskDataset): json.dump(text_annotations_by_frame, f) if distributed: dist.barrier() - with open(saved_annotations_file_path, 'r') as f: + with open(saved_annotations_file_path, 'r', encoding='utf-8') as f: text_annotations_by_frame = [tuple(a) for a in json.load(f)] return text_annotations_by_frame @@ -267,8 +267,10 @@ def get_text_annotations_gt(root_path, subset): osp.join(root_path, 'Release/videoset.csv'), header=None) # 'vid', 'label', 'start_time', 'end_time', 'height', 'width', 'total_frames', 'annotated_frames', 'subset' a2d_data_info.columns = ['vid', '', '', '', '', '', '', '', 'subset'] - with open(osp.join(root_path, 'text_annotations/missed_videos.txt'), - 'r') as f: + with open( + osp.join(root_path, 'text_annotations/missed_videos.txt'), + 'r', + encoding='utf-8') as f: unused_videos = f.read().splitlines() subsets = {'train': 0, 'test': 1} # filter unused videos and videos which do not belong to our train/test subset: diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py index 34eb0450..02639be8 100644 --- a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py +++ b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py @@ -26,7 +26,7 @@ class VideoSummarizationDataset(TorchTaskDataset): self.list_n_frames = [] self.list_positions = [] - with open(self.split_filename) as f: + with open(self.split_filename, encoding='utf-8') as f: data = json.loads(f.read()) for i, split in enumerate(data): if i == self.split_index: diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py index 6a4864bf..da339083 100644 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -116,7 +116,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): } if self.framework == Frameworks.torch: - config_file = open(inputs['asr_model_config']) + config_file = open(inputs['asr_model_config'], encoding='utf-8') root = yaml.full_load(config_file) config_file.close() frontend_conf = None diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 97cd8761..5e9220bd 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -59,6 +59,7 @@ if TYPE_CHECKING: from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin from .hand_static_pipeline import HandStaticPipeline from .referring_video_object_segmentation_pipeline 
import ReferringVideoObjectSegmentationPipeline + from .language_guided_video_summarization_pipeline import LanguageGuidedVideoSummarizationPipeline else: _import_structure = { @@ -132,6 +133,9 @@ else: 'referring_video_object_segmentation_pipeline': [ 'ReferringVideoObjectSegmentationPipeline' ], + 'language_guided_video_summarization_pipeline': [ + 'LanguageGuidedVideoSummarizationPipeline' + ] } import sys diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py index 671a5b4c..6d395a46 100644 --- a/modelscope/pipelines/cv/animal_recognition_pipeline.py +++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py @@ -109,7 +109,7 @@ class AnimalRecognitionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: label_mapping_path = osp.join(self.local_path, 'label_mapping.txt') - with open(label_mapping_path, 'r') as f: + with open(label_mapping_path, 'r', encoding='utf-8') as f: label_mapping = f.readlines() score = torch.max(inputs['outputs']) inputs = { diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py index 80f6f88a..c1136882 100644 --- a/modelscope/pipelines/cv/general_recognition_pipeline.py +++ b/modelscope/pipelines/cv/general_recognition_pipeline.py @@ -110,7 +110,7 @@ class GeneralRecognitionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: label_mapping_path = osp.join(self.local_path, 'meta_info.txt') - with open(label_mapping_path, 'r') as f: + with open(label_mapping_path, 'r', encoding='utf-8') as f: label_mapping = f.readlines() score = torch.max(inputs['outputs']) inputs = { diff --git a/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py b/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py new file mode 100755 index 00000000..059dadb7 --- /dev/null +++ b/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py @@ -0,0 +1,250 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +import os.path as osp +import random +import shutil +import tempfile +from typing import Any, Dict + +import clip +import cv2 +import numpy as np +import torch +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.language_guided_video_summarization import \ + ClipItVideoSummarization +from modelscope.models.cv.language_guided_video_summarization.summarizer import ( + extract_video_features, video_features_to_txt) +from modelscope.models.cv.video_summarization import summary_format +from modelscope.models.cv.video_summarization.summarizer import ( + generate_summary, get_change_points) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.language_guided_video_summarization, + module_name=Pipelines.language_guided_video_summarization) +class LanguageGuidedVideoSummarizationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a language guided video summarization pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, auto_collate=False, **kwargs) + logger.info(f'loading model from {model}') + self.model_dir = model + + self.tmp_dir = kwargs.get('tmp_dir', None) + if self.tmp_dir is None: + self.tmp_dir = tempfile.TemporaryDirectory().name + + config_path = osp.join(model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + + self.clip_model, self.clip_preprocess = clip.load( + 'ViT-B/32', + device=self.device, + download_root=os.path.join(self.model_dir, 'clip')) + + self.clipit_model = ClipItVideoSummarization(model) + self.clipit_model = self.clipit_model.to(self.device).eval() + + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if not isinstance(input, tuple): + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + + video_path, sentences = input + + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + frames = [] + picks = [] + cap = cv2.VideoCapture(video_path) + self.fps = cap.get(cv2.CAP_PROP_FPS) + self.frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) + frame_idx = 0 + # extract 1 frame every 15 frames in the video and save the frame index + while (cap.isOpened()): + ret, frame = cap.read() + if not ret: + break + if frame_idx % 15 == 0: + frames.append(frame) + picks.append(frame_idx) + frame_idx += 1 + n_frame = frame_idx + + if sentences is None: + logger.info('input sentences is none, using sentences from video!') + + tmp_path = os.path.join(self.tmp_dir, 'tmp') + i3d_flow_path = os.path.join(self.model_dir, 'i3d/i3d_flow.pt') + i3d_rgb_path = os.path.join(self.model_dir, 'i3d/i3d_rgb.pt') + kinetics_class_labels = os.path.join(self.model_dir, + 'i3d/label_map.txt') + pwc_path = os.path.join(self.model_dir, 'i3d/pwc_net.pt') + vggish_model_path = os.path.join(self.model_dir, + 'vggish/vggish_model.ckpt') + vggish_pca_path = os.path.join(self.model_dir, + 'vggish/vggish_pca_params.npz') + + device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + i3d_feats = extract_video_features( + video_path=video_path, + feature_type='i3d', + tmp_path=tmp_path, + i3d_flow_path=i3d_flow_path, + i3d_rgb_path=i3d_rgb_path, + kinetics_class_labels=kinetics_class_labels, + pwc_path=pwc_path, + vggish_model_path=vggish_model_path, + vggish_pca_path=vggish_pca_path, + extraction_fps=2, + device=device) + rgb = i3d_feats['rgb'] + flow = i3d_feats['flow'] + + device = '/gpu:0' if torch.cuda.is_available() else '/cpu:0' + vggish = extract_video_features( + video_path=video_path, + feature_type='vggish', + tmp_path=tmp_path, + i3d_flow_path=i3d_flow_path, + i3d_rgb_path=i3d_rgb_path, + kinetics_class_labels=kinetics_class_labels, + pwc_path=pwc_path, + vggish_model_path=vggish_model_path, + vggish_pca_path=vggish_pca_path, + extraction_fps=2, + device=device) + audio = vggish['audio'] + + duration_in_secs = float(self.frame_count) / self.fps + + txt = video_features_to_txt( + duration_in_secs=duration_in_secs, + pretrained_cap_model_path=os.path.join( + self.model_dir, 'bmt/sample/best_cap_model.pt'), + prop_generator_model_path=os.path.join( + self.model_dir, 'bmt/sample/best_prop_model.pt'), + features={ + 'rgb': rgb, + 'flow': flow, + 'audio': audio + }, + device_id=0) + sentences = [item['sentence'] for item in txt] + + clip_image_features = [] + for frame in frames: + x = self.clip_preprocess( + Image.fromarray(cv2.cvtColor( + frame, cv2.COLOR_BGR2RGB))).unsqueeze(0).to(self.device) + with torch.no_grad(): + f 
= self.clip_model.encode_image(x).squeeze(0).cpu().numpy() + clip_image_features.append(f) + + clip_txt_features = [] + for sentence in sentences: + text_input = clip.tokenize(sentence).to(self.device) + with torch.no_grad(): + text_feature = self.clip_model.encode_text(text_input).squeeze( + 0).cpu().numpy() + clip_txt_features.append(text_feature) + clip_txt_features = self.sample_txt_feateures(clip_txt_features) + clip_txt_features = np.array(clip_txt_features).reshape((1, -1)) + + result = { + 'video_name': video_path, + 'clip_image_features': np.array(clip_image_features), + 'clip_txt_features': np.array(clip_txt_features), + 'n_frame': n_frame, + 'picks': np.array(picks) + } + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + clip_image_features = input['clip_image_features'] + clip_txt_features = input['clip_txt_features'] + clip_image_features = self.norm_feature(clip_image_features) + clip_txt_features = self.norm_feature(clip_txt_features) + + change_points, n_frame_per_seg = get_change_points( + clip_image_features, input['n_frame']) + + summary = self.inference(clip_image_features, clip_txt_features, + input['n_frame'], input['picks'], + change_points) + + output = summary_format(summary, self.fps) + + return {OutputKeys.OUTPUT: output} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + return inputs + + def inference(self, clip_image_features, clip_txt_features, n_frames, + picks, change_points): + clip_image_features = torch.from_numpy( + np.array(clip_image_features, np.float32)).unsqueeze(0) + clip_txt_features = torch.from_numpy( + np.array(clip_txt_features, np.float32)).unsqueeze(0) + picks = np.array(picks, np.int32) + + with torch.no_grad(): + results = self.clipit_model( + dict( + frame_features=clip_image_features, + txt_features=clip_txt_features)) + scores = results['scores'] + if not scores.device.type == 'cpu': + scores = scores.cpu() + scores = scores.squeeze(0).numpy().tolist() + summary = generate_summary([change_points], [scores], [n_frames], + [picks])[0] + + return summary.tolist() + + def sample_txt_feateures(self, feat, num=7): + while len(feat) < num: + feat.append(feat[-1]) + idxes = list(np.arange(0, len(feat))) + samples_idx = [] + for ii in range(num): + idx = random.choice(idxes) + while idx in samples_idx: + idx = random.choice(idxes) + samples_idx.append(idx) + samples_idx.sort() + + samples = [] + for idx in samples_idx: + samples.append(feat[idx]) + return samples + + def norm_feature(self, frames_feat): + for ii in range(len(frames_feat)): + frame_feat = frames_feat[ii] + frames_feat[ii] = frame_feat / np.linalg.norm(frame_feat) + frames_feat = frames_feat.reshape((frames_feat.shape[0], -1)) + return frames_feat diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py index e81467a1..d90f8db6 100644 --- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py @@ -49,7 +49,7 @@ class OCRRecognitionPipeline(Pipeline): self.infer_model.load_state_dict( torch.load(model_path, map_location=self.device)) self.labelMapping = dict() - with open(label_path, 'r') as f: + with open(label_path, 'r', encoding='utf-8') as f: lines = f.readlines() cnt = 2 for line in lines: diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py 
index cfbf2607..f0a717a5 100644 --- a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py @@ -138,6 +138,19 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): video_np = rearrange(self.video, 't c h w -> t h w c').numpy() / 255.0 + # set font for text query in output video + if self.model.cfg.pipeline.output_font: + try: + font = ImageFont.truetype( + font=self.model.cfg.pipeline.output_font, + size=self.model.cfg.pipeline.output_font_size) + except OSError: + logger.error('can\'t open resource %s, load default font' + % self.model.cfg.pipeline.output_font) + font = ImageFont.load_default() + else: + font = ImageFont.load_default() + # del video pred_masks_per_frame = rearrange( torch.stack(inputs), 'q t 1 h w -> t q h w').numpy() @@ -158,12 +171,6 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): W, H = vid_frame.size draw = ImageDraw.Draw(vid_frame) - if self.model.cfg.pipeline.output_font: - font = ImageFont.truetype( - font=self.model.cfg.pipeline.output_font, - size=self.model.cfg.pipeline.output_font_size) - else: - font = ImageFont.load_default() for i, (text_query, color) in enumerate( zip(self.text_queries, colors), start=1): w, h = draw.textsize(text_query, font=font) @@ -173,9 +180,6 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): fill=tuple(color) + (255, ), font=font) masked_video.append(np.array(vid_frame)) - print(type(vid_frame)) - print(type(masked_video[0])) - print(masked_video[0].shape) # generate and save the output clip: assert self.model.cfg.pipeline.output_path diff --git a/modelscope/pipelines/cv/tinynas_classification_pipeline.py b/modelscope/pipelines/cv/tinynas_classification_pipeline.py index a470e58b..4dfd5c51 100644 --- a/modelscope/pipelines/cv/tinynas_classification_pipeline.py +++ b/modelscope/pipelines/cv/tinynas_classification_pipeline.py @@ -82,7 +82,7 @@ class TinynasClassificationPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: label_mapping_path = osp.join(self.path, 'label_map.txt') - f = open(label_mapping_path) + f = open(label_mapping_path, encoding='utf-8') content = f.read() f.close() label_dict = eval(content) diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py index e4c73649..4c52205e 100644 --- a/modelscope/pipelines/cv/video_category_pipeline.py +++ b/modelscope/pipelines/cv/video_category_pipeline.py @@ -36,7 +36,7 @@ class VideoCategoryPipeline(Pipeline): super().__init__(model=model, **kwargs) config_path = osp.join(self.model, ModelFile.CONFIGURATION) logger.info(f'loading configuration from {config_path}') - with open(config_path, 'r') as f: + with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) self.frame_num = config['frame_num'] self.level_1_num = config['level_1_num'] diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index b75a8153..bde78196 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -231,19 +231,6 @@ class TableQuestionAnsweringPipeline(Pipeline): header_ids = table['header_id'] + ['null'] sql = result['sql'] - str_sel_list, sql_sel_list = [], [] - for idx, sel in enumerate(sql['sel']): - header_name = header_names[sel] - header_id = '`%s`.`%s`' % (table['table_id'], header_ids[sel]) - if 
sql['agg'][idx] == 0: - str_sel_list.append(header_name) - sql_sel_list.append(header_id) - else: - str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' - + header_name + ')') - sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' - + header_id + ')') - str_cond_list, sql_cond_list = [], [] where_conds, orderby_conds = [], [] for cond in sql['conds']: @@ -285,9 +272,34 @@ class TableQuestionAnsweringPipeline(Pipeline): if is_in: str_orderby += ' LIMIT %d' % (limit_num) sql_orderby += ' LIMIT %d' % (limit_num) + # post process null column + for idx, sel in enumerate(sql['sel']): + if sel == len(header_ids) - 1: + primary_sel = 0 + for index, attrib in enumerate(table['header_attribute']): + if attrib == 'PRIMARY': + primary_sel = index + break + if primary_sel not in sql['sel']: + sql['sel'][idx] = primary_sel + else: + del sql['sel'][idx] else: str_orderby = '' + str_sel_list, sql_sel_list = [], [] + for idx, sel in enumerate(sql['sel']): + header_name = header_names[sel] + header_id = '`%s`.`%s`' % (table['table_id'], header_ids[sel]) + if sql['agg'][idx] == 0: + str_sel_list.append(header_name) + sql_sel_list.append(header_id) + else: + str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_name + ')') + sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_id + ')') + if len(str_cond_list) != 0 and len(str_orderby) != 0: final_str = 'SELECT %s FROM %s WHERE %s ORDER BY %s' % ( ', '.join(str_sel_list), table['table_name'], str_where_conds, diff --git a/modelscope/pipelines/science/protein_structure_pipeline.py b/modelscope/pipelines/science/protein_structure_pipeline.py index 3dc51c72..1ef9aa29 100644 --- a/modelscope/pipelines/science/protein_structure_pipeline.py +++ b/modelscope/pipelines/science/protein_structure_pipeline.py @@ -59,8 +59,9 @@ def load_feature_for_one_target( else: uniprot_msa_dir = data_folder - sequence_ids = open(os.path.join(data_folder, - 'chains.txt')).readline().split() + sequence_ids = open( + os.path.join(data_folder, 'chains.txt'), + encoding='utf-8').readline().split() if symmetry_group is None: batch, _ = load_and_process( diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py index 1e659218..f02381ad 100644 --- a/modelscope/preprocessors/audio.py +++ b/modelscope/preprocessors/audio.py @@ -15,7 +15,7 @@ from modelscope.utils.constant import Fields def load_kaldi_feature_transform(filename): - fp = open(filename, 'r') + fp = open(filename, 'r', encoding='utf-8') all_str = fp.read() pos1 = all_str.find('AddShift') pos2 = all_str.find('[', pos1) diff --git a/modelscope/preprocessors/kws.py b/modelscope/preprocessors/kws.py index 6f09d545..33847702 100644 --- a/modelscope/preprocessors/kws.py +++ b/modelscope/preprocessors/kws.py @@ -78,7 +78,7 @@ class WavToLists(Preprocessor): assert os.path.exists( inputs['config_path']), 'model config yaml file does not exist' - config_file = open(inputs['config_path']) + config_file = open(inputs['config_path'], encoding='utf-8') root = yaml.full_load(config_file) config_file.close() diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 3a3ae820..52cde61c 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -145,8 +145,9 @@ class CLIPPreprocessor(Preprocessor): self.image_resolution = kwargs['resolution'] else: self.image_resolution = json.load( - open('{}/vision_model_config.json'.format( - model_dir)))['image_resolution'] + open( + 
'{}/vision_model_config.json'.format(model_dir), + encoding='utf-8'))['image_resolution'] self.img_preprocess = self._build_image_transform() # key mapping # specify the input keys, compatible with training and inference whose key names may be different diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 45efc6e7..7fe28eb5 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -59,8 +59,10 @@ class NLPBasePreprocessor(Preprocessor, ABC): self.use_fast = False elif self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): - with open(os.path.join(model_dir, 'tokenizer_config.json'), - 'r') as f: + with open( + os.path.join(model_dir, 'tokenizer_config.json'), + 'r', + encoding='utf-8') as f: json_config = json.load(f) self.use_fast = json_config.get('use_fast') self.use_fast = False if self.use_fast is None else self.use_fast diff --git a/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py index 2923157e..5aa662fc 100644 --- a/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py @@ -35,7 +35,10 @@ class DialogIntentPredictionPreprocessor(Preprocessor): self.model_dir, config=self.config) self.categories = None - with open(os.path.join(self.model_dir, 'categories.json'), 'r') as f: + with open( + os.path.join(self.model_dir, 'categories.json'), + 'r', + encoding='utf-8') as f: self.categories = json.load(f) assert len(self.categories) == 77 diff --git a/modelscope/preprocessors/nlp/space/dst_processors.py b/modelscope/preprocessors/nlp/space/dst_processors.py index 1f9920a9..1b6159b5 100644 --- a/modelscope/preprocessors/nlp/space/dst_processors.py +++ b/modelscope/preprocessors/nlp/space/dst_processors.py @@ -184,7 +184,7 @@ class multiwoz22Processor(DSTProcessor): # Loads the dialogue_acts.json and returns a list # of slot-value pairs. 
def load_acts(self, input_file): - with open(input_file) as f: + with open(input_file, encoding='utf-8') as f: acts = json.load(f) s_dict = {} for d in acts: diff --git a/modelscope/preprocessors/nlp/space/fields/gen_field.py b/modelscope/preprocessors/nlp/space/fields/gen_field.py index 1d1879fe..20b2c48a 100644 --- a/modelscope/preprocessors/nlp/space/fields/gen_field.py +++ b/modelscope/preprocessors/nlp/space/fields/gen_field.py @@ -359,12 +359,14 @@ class MultiWOZBPETextField(BPETextField): test_list = [ line.strip().lower() for line in open( os.path.join(kwargs['data_dir'], 'testListFile.json'), - 'r').readlines() + 'r', + encoding='utf-8').readlines() ] dev_list = [ line.strip().lower() for line in open( os.path.join(kwargs['data_dir'], 'valListFile.json'), - 'r').readlines() + 'r', + encoding='utf-8').readlines() ] self.dev_files, self.test_files = {}, {} diff --git a/modelscope/preprocessors/nlp/space/tokenizer.py b/modelscope/preprocessors/nlp/space/tokenizer.py index 1bd0ce11..798ce3b7 100644 --- a/modelscope/preprocessors/nlp/space/tokenizer.py +++ b/modelscope/preprocessors/nlp/space/tokenizer.py @@ -531,7 +531,7 @@ class GPT2Tokenizer(object): special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) - self.encoder = json.load(open(vocab_file)) + self.encoder = json.load(open(vocab_file, encoding='utf-8')) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() diff --git a/modelscope/preprocessors/nlp/space_T_cn/fields/database.py b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py index 2fef8d7e..1300cc95 100644 --- a/modelscope/preprocessors/nlp/space_T_cn/fields/database.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py @@ -20,9 +20,9 @@ class Database: self.connection_obj = sqlite3.connect( ':memory:', check_same_thread=False) self.type_dict = {'text': 'TEXT', 'number': 'INT', 'date': 'TEXT'} - self.tables = self.init_tables(table_file_path=table_file_path) self.syn_dict = self.init_syn_dict( syn_dict_file_path=syn_dict_file_path) + self.tables = self.init_tables(table_file_path=table_file_path) def __del__(self): if self.is_use_sqlite: @@ -32,12 +32,12 @@ class Database: tables = {} lines = [] if type(table_file_path) == str: - with open(table_file_path, 'r') as fo: + with open(table_file_path, 'r', encoding='utf-8') as fo: for line in fo: lines.append(line) elif type(table_file_path) == list: for path in table_file_path: - with open(path, 'r') as fo: + with open(path, 'r', encoding='utf-8') as fo: for line in fo: lines.append(line) else: @@ -75,6 +75,10 @@ class Database: continue word = str(cell).strip().lower() trie_set[ii].insert(word, word) + if word in self.syn_dict.keys(): + for term in self.syn_dict[word]: + if term.strip() != '': + trie_set[ii].insert(term, word) table['value_trie'] = trie_set diff --git a/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py index 00c7bcd7..0ebd857e 100644 --- a/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py @@ -45,7 +45,7 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): and torch.cuda.is_available() else 'cpu' self.processor = None self.table_path = os.path.join(self.model_dir, 'tables.json') - self.tables = 
json.load(open(self.table_path, 'r')) + self.tables = json.load(open(self.table_path, 'r', encoding='utf-8')) self.output_tables = None self.path_cache = [] self.graph_processor = GraphProcessor() @@ -89,7 +89,7 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): 'local_db_path'] not in self.path_cache: self.path_cache.append(data['local_db_path']) path = os.path.join(data['local_db_path'], 'tables.json') - self.tables = json.load(open(path, 'r')) + self.tables = json.load(open(path, 'r', encoding='utf-8')) self.processor.db_dir = os.path.join(data['local_db_path'], 'db') self.output_tables = process_tables(self.processor, self.tables) Example.configuration( diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 55b3895d..e5c30ff8 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -76,7 +76,7 @@ class OfaBasePreprocessor: self.constraint_trie = None if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) - with open(ans2label_file, 'r') as reader: + with open(ans2label_file, 'r', encoding='utf-8') as reader: ans2label_dict = json.load(reader) self.ans2label = ans2label_dict self.label2ans = {v: k for k, v in self.ans2label.items()} diff --git a/modelscope/preprocessors/science/uni_fold.py b/modelscope/preprocessors/science/uni_fold.py index 2a44c885..ae72433c 100644 --- a/modelscope/preprocessors/science/uni_fold.py +++ b/modelscope/preprocessors/science/uni_fold.py @@ -201,7 +201,7 @@ def run_mmseqs2( a3m_lines = {} for a3m_file in a3m_files: update_M, M = True, None - with open(a3m_file, 'r') as f: + with open(a3m_file, 'r', encoding='utf-8') as f: lines = f.readlines() for line in lines: if len(line) > 0: diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py index c93599c7..08a3a351 100644 --- a/modelscope/trainers/nlp/csanmt_translation_trainer.py +++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp +import time from typing import Dict, Optional import tensorflow as tf @@ -122,8 +123,7 @@ class CsanmtTranslationTrainer(BaseTrainer): self.params['scale_l1'] = self.cfg['train']['scale_l1'] self.params['scale_l2'] = self.cfg['train']['scale_l2'] self.params['train_max_len'] = self.cfg['train']['train_max_len'] - self.params['max_training_steps'] = self.cfg['train'][ - 'max_training_steps'] + self.params['num_of_epochs'] = self.cfg['train']['num_of_epochs'] self.params['save_checkpoints_steps'] = self.cfg['train'][ 'save_checkpoints_steps'] self.params['num_of_samples'] = self.cfg['train']['num_of_samples'] @@ -144,14 +144,15 @@ class CsanmtTranslationTrainer(BaseTrainer): vocab_src = osp.join(self.model_dir, self.params['vocab_src']) vocab_trg = osp.join(self.model_dir, self.params['vocab_trg']) + epoch = 0 iteration = 0 with self._session.as_default() as tf_session: while True: - iteration += 1 - if iteration >= self.params['max_training_steps']: + epoch += 1 + if epoch >= self.params['num_of_epochs']: break - + tf.logging.info('%s: Epoch %i' % (__name__, epoch)) train_input_fn = input_fn( train_src, train_trg, @@ -160,36 +161,44 @@ class CsanmtTranslationTrainer(BaseTrainer): batch_size_words=self.params['train_batch_size_words'], max_len=self.params['train_max_len'], num_gpus=self.params['num_gpus'] - if self.params['num_gpus'] > 0 else 1, + if self.params['num_gpus'] > 1 else 1, is_train=True, session=tf_session, - iteration=iteration) + epoch=epoch) features, labels = train_input_fn - features_batch, labels_batch = tf_session.run( - [features, labels]) - - feed_dict = { - self.source_wids: features_batch, - self.target_wids: labels_batch - } - sess_outputs = self._session.run( - self.output, feed_dict=feed_dict) - loss_step = sess_outputs['loss'] - logger.info('Iteration: {}, step loss: {:.6f}'.format( - iteration, loss_step)) - - if iteration % self.params['save_checkpoints_steps'] == 0: - tf.logging.info('%s: Saving model on step: %d.' % - (__name__, iteration)) - ck_path = self.model_dir + 'model.ckpt' - self.model_saver.save( - tf_session, - ck_path, - global_step=tf.train.get_global_step()) - - tf.logging.info('%s: NMT training completed at time: %s.') + try: + while True: + features_batch, labels_batch = tf_session.run( + [features, labels]) + iteration += 1 + feed_dict = { + self.source_wids: features_batch, + self.target_wids: labels_batch + } + sess_outputs = self._session.run( + self.output, feed_dict=feed_dict) + loss_step = sess_outputs['loss'] + logger.info('Iteration: {}, step loss: {:.6f}'.format( + iteration, loss_step)) + + if iteration % self.params[ + 'save_checkpoints_steps'] == 0: + tf.logging.info('%s: Saving model on step: %d.' % + (__name__, iteration)) + ck_path = self.model_dir + 'model.ckpt' + self.model_saver.save( + tf_session, + ck_path, + global_step=tf.train.get_global_step()) + + except tf.errors.OutOfRangeError: + tf.logging.info('epoch %d end!' % (epoch)) + + tf.logging.info( + '%s: NMT training completed at time: %s.' 
% + (__name__, time.asctime(time.localtime(time.time())))) def evaluate(self, checkpoint_path: Optional[str] = None, @@ -222,7 +231,7 @@ def input_fn(src_file, num_gpus=1, is_train=True, session=None, - iteration=None): + epoch=None): src_vocab = tf.lookup.StaticVocabularyTable( tf.lookup.TextFileInitializer( src_vocab_file, @@ -291,7 +300,7 @@ def input_fn(src_file, if is_train: session.run(iterator.initializer) - if iteration == 1: + if epoch == 1: session.run(tf.tables_initializer()) return features, labels diff --git a/modelscope/trainers/nlp/space/eval.py b/modelscope/trainers/nlp/space/eval.py index f315ff07..2db40cae 100644 --- a/modelscope/trainers/nlp/space/eval.py +++ b/modelscope/trainers/nlp/space/eval.py @@ -771,7 +771,8 @@ class CamRestEvaluator(GenericEvaluator): def get_entities(self, entity_path): entities_flat = [] entitiy_to_slot_dict = {} - raw_entities = json.loads(open(entity_path).read().lower()) + raw_entities = json.loads( + open(entity_path, encoding='utf-8').read().lower()) for s in raw_entities['informable']: entities_flat.extend(raw_entities['informable'][s]) for v in raw_entities['informable'][s]: diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 32e2fa54..1ae5c8d2 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -47,7 +47,7 @@ def update_conf(origin_config_file, new_config_file, conf_item: [str, str]): else: return None - with open(origin_config_file) as f: + with open(origin_config_file, encoding='utf-8') as f: lines = f.readlines() with open(new_config_file, 'w') as f: for line in lines: diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index e46da7df..b3512251 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -178,7 +178,7 @@ class Config: if cfg_text: text = cfg_text elif filename: - with open(filename, 'r') as f: + with open(filename, 'r', encoding='utf-8') as f: text = f.read() else: text = '' diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index f0a97dbd..b1bccc4c 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -80,6 +80,7 @@ class CVTasks(object): video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' movie_scene_segmentation = 'movie-scene-segmentation' + language_guided_video_summarization = 'language-guided-video-summarization' # video segmentation referring_video_object_segmentation = 'referring-video-object-segmentation' diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 105b3ffa..93cc20e2 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -124,7 +124,7 @@ def parse_label_mapping(model_dir): label2id = None label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING) if os.path.exists(label_path): - with open(label_path) as f: + with open(label_path, encoding='utf-8') as f: label_mapping = json.load(f) label2id = {name: idx for name, idx in label_mapping.items()} diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py index 2c971b10..cbd0ebde 100644 --- a/modelscope/utils/nlp/space/clean_dataset.py +++ b/modelscope/utils/nlp/space/clean_dataset.py @@ -59,7 +59,9 @@ def clean_text(data_dir, text): text) # 'abc.xyz' -> 'abc . xyz' text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text) # if 'abc. ' -> 'abc . 
' - with open(os.path.join(data_dir, 'mapping.pair'), 'r') as fin: + with open( + os.path.join(data_dir, 'mapping.pair'), 'r', + encoding='utf-8') as fin: for line in fin.readlines(): fromx, tox = line.replace('\n', '').split('\t') text = ' ' + text + ' ' diff --git a/modelscope/utils/nlp/space/db_ops.py b/modelscope/utils/nlp/space/db_ops.py index d1d14ef9..27198b23 100644 --- a/modelscope/utils/nlp/space/db_ops.py +++ b/modelscope/utils/nlp/space/db_ops.py @@ -15,7 +15,9 @@ class MultiWozDB(object): self.dbs = {} self.sql_dbs = {} for domain in all_domains: - with open(os.path.join(db_dir, db_paths[domain]), 'r') as f: + with open( + os.path.join(db_dir, db_paths[domain]), 'r', + encoding='utf-8') as f: self.dbs[domain] = json.loads(f.read().lower()) def oneHotVector(self, domain, num): diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py index 56e67671..70cb03a0 100644 --- a/modelscope/utils/nlp/space/utils.py +++ b/modelscope/utils/nlp/space/utils.py @@ -146,9 +146,9 @@ class MultiWOZVocab(object): def load_vocab(self, vocab_path): self._freq_dict = json.loads( - open(vocab_path + '.freq.json', 'r').read()) + open(vocab_path + '.freq.json', 'r', encoding='utf-8').read()) self._word2idx = json.loads( - open(vocab_path + '.word2idx.json', 'r').read()) + open(vocab_path + '.word2idx.json', 'r', encoding='utf-8').read()) self._idx2word = {} for w, idx in self._word2idx.items(): self._idx2word[idx] = w diff --git a/requirements/cv.txt b/requirements/cv.txt index f29b296b..43eba7f9 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -1,5 +1,7 @@ albumentations>=1.0.3 av>=9.2.0 +bmt_clipit>=1.0 +clip>=1.0 easydict fairscale>=0.4.1 fastai>=1.0.51 @@ -19,6 +21,7 @@ moviepy>=1.0.3 networkx>=2.5 numba onnxruntime>=1.10 +opencv-python pai-easycv>=0.6.3.9 pandas psutil @@ -32,3 +35,4 @@ tf_slim timm>=0.4.9 torchmetrics>=0.6.2 torchvision +videofeatures_clipit>=1.0 diff --git a/requirements/framework.txt b/requirements/framework.txt index a86c0cc5..52601579 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -# version beyond 2.5.2 introduces compatbility issue and is being resolved +# version beyond 2.5.2 introduces compatibility issue and is being resolved datasets<=2.5.2 easydict einops @@ -8,7 +8,6 @@ filelock>=3.3.0 gast>=0.2.2 jsonplus numpy -opencv-python oss2 Pillow>=6.2.0 # for pyarrow 9.0.0 event_loop core dump diff --git a/setup.py b/setup.py index eff2f8ba..d709dadc 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ def get_hash(): def get_version(): - with open(version_file, 'r') as f: + with open(version_file, 'r', encoding='utf-8') as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] @@ -109,7 +109,7 @@ def parse_requirements(fname='requirements.txt', with_version=True): yield info def parse_require_file(fpath): - with open(fpath, 'r') as f: + with open(fpath, 'r', encoding='utf-8') as f: for line in f.readlines(): line = line.strip() if line.startswith('http'): diff --git a/tests/pipelines/test_language_guided_video_summarization.py b/tests/pipelines/test_language_guided_video_summarization.py new file mode 100755 index 00000000..0f06d4f2 --- /dev/null +++ b/tests/pipelines/test_language_guided_video_summarization.py @@ -0,0 +1,49 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class LanguageGuidedVideoSummarizationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.language_guided_video_summarization + self.model_id = 'damo/cv_clip-it_video-summarization_language-guided_en' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + video_path = 'data/test/videos/video_category_test_video.mp4' + # input can be sentences such as sentences=['phone', 'hand'], or sentences=None + sentences = None + summarization_pipeline = pipeline( + Tasks.language_guided_video_summarization, model=self.model_id) + result = summarization_pipeline((video_path, sentences)) + + print(f'video summarization output: \n{result}.') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + video_path = 'data/test/videos/video_category_test_video.mp4' + summarization_pipeline = pipeline( + Tasks.language_guided_video_summarization) + result = summarization_pipeline(video_path) + + print(f'video summarization output:\n {result}.') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_referring_video_object_segmentation.py b/tests/pipelines/test_referring_video_object_segmentation.py index 4d8206b3..3e81d9c3 100644 --- a/tests/pipelines/test_referring_video_object_segmentation.py +++ b/tests/pipelines/test_referring_video_object_segmentation.py @@ -14,7 +14,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase, self.task = Tasks.referring_video_object_segmentation self.model_id = 'damo/cv_swin-t_referring_video-object-segmentation' - @unittest.skip('skip since the model is set to private for now') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_referring_video_object_segmentation(self): input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' text_queries = [ @@ -31,7 +31,7 @@ class ReferringVideoObjectSegmentationTest(unittest.TestCase, else: raise ValueError('process error') - @unittest.skip('skip since the model is set to private for now') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_referring_video_object_segmentation_with_default_task(self): input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' text_queries = [ diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 825d8f23..9faed993 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -24,13 +24,10 @@ def tableqa_tracking_and_print_results_with_history( 'utterance': [ '有哪些风险类型?', '风险类型有多少种?', - '珠江流域的小(2)型水库的库容总量是多少?', + '珠江流域的小型水库的库容总量是多少?', '那平均值是多少?', '那水库的名称呢?', '换成中型的呢?', - '枣庄营业厅的电话', - '那地址呢?', - '枣庄营业厅的电话和地址', ] } for p in pipelines: @@ -55,9 +52,7 @@ def tableqa_tracking_and_print_results_without_history( 'utterance': [ '有哪些风险类型?', '风险类型有多少种?', - '珠江流域的小(2)型水库的库容总量是多少?', - '枣庄营业厅的电话', - '枣庄营业厅的电话和地址', + '珠江流域的小型水库的库容总量是多少?', ] } 
for p in pipelines: @@ -77,13 +72,10 @@ def tableqa_tracking_and_print_results_with_tableid( 'utterance': [ ['有哪些风险类型?', 'fund'], ['风险类型有多少种?', 'reservoir'], - ['珠江流域的小(2)型水库的库容总量是多少?', 'reservoir'], + ['珠江流域的小型水库的库容总量是多少?', 'reservoir'], ['那平均值是多少?', 'reservoir'], ['那水库的名称呢?', 'reservoir'], ['换成中型的呢?', 'reservoir'], - ['枣庄营业厅的电话', 'business'], - ['那地址呢?', 'business'], - ['枣庄营业厅的电话和地址', 'business'], ], } for p in pipelines: @@ -157,7 +149,7 @@ class TableQuestionAnswering(unittest.TestCase): os.path.join(model.model_dir, 'databases')) ], syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'), - is_use_sqlite=False) + is_use_sqlite=True) preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir, db=db) pipelines = [ diff --git a/tests/run.py b/tests/run.py index b286ecb5..0759379f 100644 --- a/tests/run.py +++ b/tests/run.py @@ -247,7 +247,7 @@ def run_in_subprocess(args): test_suite_env_map[test_suite_file] = 'default' if args.run_config is not None and Path(args.run_config).exists(): - with open(args.run_config) as f: + with open(args.run_config, encoding='utf-8') as f: run_config = yaml.load(f, Loader=yaml.FullLoader) if 'isolated' in run_config: isolated_cases = run_config['isolated'] diff --git a/tests/run_config.yaml b/tests/run_config.yaml index d51e2606..faee2869 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -12,6 +12,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which - test_segmentation_pipeline.py - test_movie_scene_segmentation.py - test_image_inpainting.py + - test_mglm_text_summarization.py envs: default: # default env, case not in other env will in default, pytorch. diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py index 4bd63c55..5d714097 100644 --- a/tests/trainers/easycv/test_easycv_trainer.py +++ b/tests/trainers/easycv/test_easycv_trainer.py @@ -109,7 +109,7 @@ class EasyCVTrainerTestSingleGpu(unittest.TestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( @@ -185,7 +185,7 @@ class EasyCVTrainerTestMultiGpus(DistributedTestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index b742dcae..3b5882bd 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -62,7 +62,7 @@ class ImageDenoiseTrainerTest(unittest.TestCase): trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(2): + for i in range(1): self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -73,13 +73,13 @@ class ImageDenoiseTrainerTest(unittest.TestCase): model=model, train_dataset=self.dataset_train, eval_dataset=self.dataset_val, - max_epochs=2, + max_epochs=1, work_dir=self.tmp_dir) trainer = build_trainer(default_args=kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) 
self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(2): + for i in range(1): self.assertIn(f'epoch_{i+1}.pth', results_files) diff --git a/tests/trainers/test_referring_video_object_segmentation_trainer.py b/tests/trainers/test_referring_video_object_segmentation_trainer.py index 7b03eb4d..fb152954 100644 --- a/tests/trainers/test_referring_video_object_segmentation_trainer.py +++ b/tests/trainers/test_referring_video_object_segmentation_trainer.py @@ -7,8 +7,8 @@ import zipfile from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers -from modelscope.models.cv.movie_scene_segmentation import \ - MovieSceneSegmentationModel +from modelscope.models.cv.referring_video_object_segmentation import \ + ReferringVideoObjectSegmentation from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config, ConfigDict @@ -46,7 +46,6 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): dataset_name=train_data_cfg.name, split=train_data_cfg.split, cfg=train_data_cfg.cfg, - namespace='damo', test_mode=train_data_cfg.test_mode) assert next( iter(self.train_dataset.config_kwargs['split_config'].values())) @@ -55,14 +54,13 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): dataset_name=test_data_cfg.name, split=test_data_cfg.split, cfg=test_data_cfg.cfg, - namespace='damo', test_mode=test_data_cfg.test_mode) assert next( iter(self.test_dataset.config_kwargs['split_config'].values())) self.max_epochs = max_epochs - @unittest.skip('skip since the model is set to private for now') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer(self): kwargs = dict( model=self.model_id, @@ -77,11 +75,11 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): results_files = os.listdir(trainer.work_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) - @unittest.skip('skip since the model is set to private for now') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_model_and_args(self): cache_path = snapshot_download(self.model_id) - model = MovieSceneSegmentationModel.from_pretrained(cache_path) + model = ReferringVideoObjectSegmentation.from_pretrained(cache_path) kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), model=model, diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index c73a56a3..5d466ee0 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -248,7 +248,7 @@ class TrainerTest(unittest.TestCase): results_files = os.listdir(self.tmp_dir) json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json') - with open(json_file, 'r') as f: + with open(json_file, 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( { @@ -367,7 +367,7 @@ class TrainerTest(unittest.TestCase): trainer.train() results_files = os.listdir(self.tmp_dir) json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json') - with open(json_file, 'r') as f: + with open(json_file, 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( { diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 0176704a..c003f3c9 100644 --- a/tests/trainers/test_trainer_gpu.py +++ b/tests/trainers/test_trainer_gpu.py @@ -142,7 +142,7 @@ class 
TrainerTestSingleGpu(unittest.TestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( { @@ -236,7 +236,7 @@ class TrainerTestMultiGpus(DistributedTestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] self.assertDictContainsSubset( @@ -320,7 +320,7 @@ class TrainerTestMultiGpus(DistributedTestCase): json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) self.assertEqual(len(json_files), 1) - with open(json_files[0], 'r') as f: + with open(json_files[0], 'r', encoding='utf-8') as f: lines = [i.strip() for i in f.readlines()] print(results_files, lines) diff --git a/tests/trainers/test_translation_trainer.py b/tests/trainers/test_translation_trainer.py index 71bed241..7be23145 100644 --- a/tests/trainers/test_translation_trainer.py +++ b/tests/trainers/test_translation_trainer.py @@ -6,11 +6,17 @@ from modelscope.utils.test_utils import test_level class TranslationTest(unittest.TestCase): - model_id = 'damo/nlp_csanmt_translation_zh2en' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_model_name(self): - trainer = CsanmtTranslationTrainer(model=self.model_id) + def test_run_with_model_name_for_en2zh(self): + model_id = 'damo/nlp_csanmt_translation_en2zh' + trainer = CsanmtTranslationTrainer(model=model_id) + trainer.train() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_en2fr(self): + model_id = 'damo/nlp_csanmt_translation_en2fr' + trainer = CsanmtTranslationTrainer(model=model_id) trainer.train()
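For quick reference, the language-guided video summarization pipeline introduced by this patch can be exercised the same way the added test does. A minimal usage sketch, assuming the model id and sample video referenced in tests/pipelines/test_language_guided_video_summarization.py are available:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Model id and sample video path taken from the test added in this patch.
summarizer = pipeline(
    Tasks.language_guided_video_summarization,
    model='damo/cv_clip-it_video-summarization_language-guided_en')

# Input is a (video_path, sentences) tuple; sentences may be a list of guiding
# phrases such as ['phone', 'hand'], or None, in which case captions are
# generated from the video itself before summarization.
result = summarizer(('data/test/videos/video_category_test_video.mp4', None))
print(result)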