From 202bd4c5a85db3b8f0ea8b30c09b55acf3b81022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Wed, 24 Aug 2022 11:13:16 +0800 Subject: [PATCH 01/54] fix warning --- .../models/multi_modal/ofa/generate/search.py | 2 +- .../ofa/generate/sequence_generator.py | 11 +- .../models/multi_modal/ofa/modeling_ofa.py | 3 +- .../models/multi_modal/ofa/utils/constant.py | 4 +- .../models/multi_modal/ofa_for_all_tasks.py | 19 ++- modelscope/preprocessors/multi_modal.py | 20 ++- modelscope/preprocessors/ofa/base.py | 3 +- .../preprocessors/ofa/image_captioning.py | 16 ++- .../preprocessors/ofa/image_classification.py | 14 +- modelscope/preprocessors/ofa/summarization.py | 10 +- .../preprocessors/ofa/text_classification.py | 11 +- .../ofa/text_to_image_synthesis.py | 10 +- modelscope/preprocessors/ofa/utils/collate.py | 7 +- .../preprocessors/ofa/visual_entailment.py | 15 +- .../preprocessors/ofa/visual_grounding.py | 15 +- .../ofa/visual_question_answering.py | 14 +- .../trainers/multi_modal/ofa/__init__.py | 0 .../multi_modal/ofa/ofa_file_dataset.py | 131 ++++++++++++++++++ .../trainers/multi_modal/ofa/ofa_trainer.py | 0 .../multi_modal/ofa/ofa_trainer_utils.py | 52 +++++++ 20 files changed, 294 insertions(+), 63 deletions(-) create mode 100644 modelscope/trainers/multi_modal/ofa/__init__.py create mode 100644 modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py create mode 100644 modelscope/trainers/multi_modal/ofa/ofa_trainer.py create mode 100644 modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py diff --git a/modelscope/models/multi_modal/ofa/generate/search.py b/modelscope/models/multi_modal/ofa/generate/search.py index 63ecb0a9..0dcaf6b3 100644 --- a/modelscope/models/multi_modal/ofa/generate/search.py +++ b/modelscope/models/multi_modal/ofa/generate/search.py @@ -148,7 +148,7 @@ class BeamSearch(Search): scores_buf = top_prediction[0] indices_buf = top_prediction[1] # Project back into relative indices and beams - beams_buf = indices_buf // vocab_size + beams_buf = torch.div(indices_buf, vocab_size, rounding_mode='floor') indices_buf = indices_buf.fmod(vocab_size) # At this point, beams_buf and indices_buf are single-dim and contain relative indices diff --git a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py index 590fb67b..9d427836 100644 --- a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py +++ b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py @@ -385,12 +385,7 @@ class SequenceGenerator(nn.Module): attn = torch.empty(bsz * beam_size, avg_attn_scores.size(1), max_len + 2).to(scores) - # print("+++++++ debug attention shape +++++++") - # print("attn", attn.shape) - # print("avg_attn_scores", avg_attn_scores.shape) attn[:, :, step + 1].copy_(avg_attn_scores) - # print("attn[:, :, step + 1]", attn[:, :, step + 1].shape) - # print("attn", attn.shape) scores = scores.type_as(lprobs) eos_bbsz_idx = torch.empty(0).to( @@ -403,7 +398,8 @@ class SequenceGenerator(nn.Module): if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) - if self.repeat_ngram_blocker is not None: + if self.repeat_ngram_blocker is not None and step > prefix_tokens.size( + 1): lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step) @@ -415,7 +411,6 @@ class SequenceGenerator(nn.Module): tokens[:, :step + 1], original_batch_idxs, ) - # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # 
and dimensions: [bsz, cand_size] @@ -671,7 +666,7 @@ class SequenceGenerator(nn.Module): cum_unfin.append(prev) cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx) - unfin_idx = bbsz_idx // beam_size + unfin_idx = torch.div(bbsz_idx, beam_size, rounding_mode='floor') sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx) # Create a set of "{sent}{unfin_idx}", where diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py index 01cc02f9..4de35741 100755 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -114,7 +114,8 @@ def make_image_bucket_position(bucket_size, num_relative_distance): """ coords_h = torch.arange(bucket_size) coords_w = torch.arange(bucket_size) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords = torch.stack(torch.meshgrid([coords_h, coords_w], + indexing='ij')) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - \ coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww diff --git a/modelscope/models/multi_modal/ofa/utils/constant.py b/modelscope/models/multi_modal/ofa/utils/constant.py index 984da443..124afefa 100644 --- a/modelscope/models/multi_modal/ofa/utils/constant.py +++ b/modelscope/models/multi_modal/ofa/utils/constant.py @@ -7,7 +7,7 @@ OFA_TASK_KEY_MAPPING = { Tasks.summarization: OutputKeys.TEXT, Tasks.visual_question_answering: OutputKeys.TEXT, Tasks.visual_grounding: OutputKeys.BOXES, - Tasks.text_classification: (OutputKeys.SCORES, OutputKeys.LABELS), + Tasks.text_classification: OutputKeys.LABELS, Tasks.image_classification: OutputKeys.LABELS, - Tasks.visual_entailment: (OutputKeys.SCORES, OutputKeys.LABELS), + Tasks.visual_entailment: OutputKeys.LABELS, } diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 860b68d3..80471e3c 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -127,10 +127,23 @@ class OfaForAllTasks(TorchModel): return input def _text_gen_inference(self, input): + import pdb + pdb.set_trace() input = move_to_device(input, self._device) - gen_output = self.generator.generate([self.model], input) - gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] - result = self.tokenizer.batch_decode(gen, skip_special_tokens=True) + if 'prefix_tokens' in input: + gen_output = self.generator.generate( + [self.model], input, prefix_tokens=input['prefix_tokens']) + else: + gen_output = self.generator.generate([self.model], input) + gen_l = list() + for i in range(len(gen_output)): + if 'prefix_tokens' in input: + prefix_tokens = input['prefix_tokens'] + gen_l.append( + gen_output[i][0]['tokens'][len(prefix_tokens[i]):]) + else: + gen_l.append(gen_output[i][0]['tokens']) + result = self.tokenizer.batch_decode(gen_l, skip_special_tokens=True) # text generation tasks have no score ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result} if self.cfg.task.endswith('classification'): diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 65578e6a..46648832 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp +from io import BytesIO from typing import Any, Dict, List, Union import torch @@ -8,6 +9,7 @@ from PIL import Image from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Preprocessors from modelscope.pipelines.base import Input +from modelscope.preprocessors.image import load_image from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile, Tasks from .base import Preprocessor @@ -71,20 +73,32 @@ class OfaPreprocessor(Preprocessor): data[key] = item return data + def _compatible_with_pretrain(self, data): + if 'image' in data and self.cfg.model.get('type', None) == 'ofa': + image = load_image(data['image']) + img_buffer = BytesIO() + image.save(img_buffer, format='JPEG') + data['image'] = Image.open(img_buffer) + return data + def __call__(self, input: Union[str, tuple, Dict[str, Any]], *args, **kwargs) -> Dict[str, Any]: if isinstance(input, dict): data = input else: data = self._build_dict(input) + data = self._compatible_with_pretrain(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): str_data[k] = str(v) sample['sample'] = str_data - return collate_fn([sample], - pad_idx=self.tokenizer.pad_token_id, - eos_idx=self.tokenizer.eos_token_id) + if kwargs.get('no_collate', None): + return sample + else: + return collate_fn([sample], + pad_idx=self.tokenizer.pad_token_id, + eos_idx=self.tokenizer.eos_token_id) @PREPROCESSORS.register_module( diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index fb9d06cd..69286f69 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -13,7 +13,7 @@ from .utils.random_help import set_torch_seed class OfaBasePreprocessor: - def __init__(self, cfg, model_dir): + def __init__(self, cfg, model_dir, split, *args, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path Args: @@ -76,6 +76,7 @@ class OfaBasePreprocessor: text, max_length=self.max_src_length, add_special_tokens=False, + truncation=True, return_tensors='pt')['input_ids'].squeeze(0) if add_bos: inputs = torch.cat([self.bos_item, inputs]) diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 264c8e04..3ea4ccb2 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -6,24 +6,28 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ - super(OfaImageCaptioningPreprocessor, self).__init__(cfg, model_dir) + super(OfaImageCaptioningPreprocessor, + self).__init__(cfg, model_dir, split, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + 
interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index 30289613..a0cd0990 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -11,20 +11,22 @@ from .base import OfaBasePreprocessor class OfaImageClassificationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ super(OfaImageClassificationPreprocessor, - self).__init__(cfg, model_dir) + self).__init__(cfg, model_dir, split, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index fd5113cd..00ae9bf9 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -6,14 +6,16 @@ from .base import OfaBasePreprocessor class OfaSummarizationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ - super(OfaSummarizationPreprocessor, self).__init__(cfg, model_dir) + super(OfaSummarizationPreprocessor, + self).__init__(cfg, model_dir, split, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: source = super().pre_caption( diff --git a/modelscope/preprocessors/ofa/text_classification.py b/modelscope/preprocessors/ofa/text_classification.py index 1a3f84fd..25981e65 100644 --- a/modelscope/preprocessors/ofa/text_classification.py +++ b/modelscope/preprocessors/ofa/text_classification.py @@ -6,14 +6,16 @@ from .base import OfaBasePreprocessor class OfaTextClassificationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ - super(OfaTextClassificationPreprocessor, self).__init__(cfg, model_dir) + super(OfaTextClassificationPreprocessor, + self).__init__(cfg, model_dir, split, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: text1 = ' '.join( @@ -34,5 +36,6 @@ class OfaTextClassificationPreprocessor(OfaBasePreprocessor): sample = { 'source': inputs, 'decoder_prompt': decoder_prompt, + 'prefix_token': decoder_prompt[:-1], } return sample diff --git 
a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py index 9dbba921..56198e67 100644 --- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py +++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py @@ -8,14 +8,16 @@ from .base import OfaBasePreprocessor class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: - model_dir (str): model path + cfg(modelscope.utils.config.ConfigDict) : model config + model_dir (str): model path, + split: data phase """ super(OfaTextToImageSynthesisPreprocessor, - self).__init__(cfg, model_dir) + self).__init__(cfg, model_dir, split, *args, **kwargs) self.max_src_length = 64 def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/ofa/utils/collate.py b/modelscope/preprocessors/ofa/utils/collate.py index a473335b..82258e8b 100644 --- a/modelscope/preprocessors/ofa/utils/collate.py +++ b/modelscope/preprocessors/ofa/utils/collate.py @@ -50,8 +50,11 @@ def collate_fn(samples, pad_idx, eos_idx): if samples[0].get('constraint_mask', None) is not None: batch['constraint_masks'] = merge('constraint_mask') if samples[0].get('decoder_prompt', None) is not None: - batch['decoder_prompts'] = np.array( - [s['decoder_prompt'].tolist() for s in samples]) + batch['decoder_prompts'] = torch.stack( + [s['decoder_prompt'] for s in samples], dim=0) + if samples[0].get('prefix_token', None) is not None: + batch['prefix_tokens'] = torch.stack( + [s['prefix_token'] for s in samples], dim=0) # For detection and visual grounding if samples[0].get('w_resize_ratio', None) is not None: batch['w_resize_ratios'] = torch.stack( diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index 72e88d75..45c719b1 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -11,19 +11,22 @@ from .base import OfaBasePreprocessor class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ - super(OfaVisualEntailmentPreprocessor, self).__init__(cfg, model_dir) + super(OfaVisualEntailmentPreprocessor, + self).__init__(cfg, model_dir, split, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index eebc4cf2..eaaed0ef 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -11,19 +11,22 @@ from .base import OfaBasePreprocessor class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): - def 
__init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ - super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir) + super(OfaVisualGroundingPreprocessor, + self).__init__(cfg, model_dir, split, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index b11af9f6..bce18c95 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -11,20 +11,22 @@ from .base import OfaBasePreprocessor class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): - """preprocess the data via the vocab.txt from the `model_dir` path + def __init__(self, cfg, model_dir, split, *args, **kwargs): + """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + split: data phase """ super(OfaVisualQuestionAnsweringPreprocessor, - self).__init__(cfg, model_dir) + self).__init__(cfg, model_dir, split, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) diff --git a/modelscope/trainers/multi_modal/ofa/__init__.py b/modelscope/trainers/multi_modal/ofa/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py new file mode 100644 index 00000000..2f64f9ff --- /dev/null +++ b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py @@ -0,0 +1,131 @@ +# Copyright 2022 The OFA-Sys Team. +# All rights reserved. +# This source code is licensed under the Apache 2.0 license +# found in the LICENSE file in the root directory. 
+ +import os +import pickle + +import torch + + +class OFAFileDataset: + + def __init__(self, + file_path, + selected_col_ids=None, + dtypes=None, + separator='\t', + cached_index=False): + self.file_path = file_path + assert os.path.exists( + self.file_path), 'Error: The local datafile {} not exists!'.format( + self.file_path) + + self.separator = separator + if selected_col_ids is None: + # default to all fields + self.selected_col_ids = list( + range( + len( + open(self.file_path).readline().rstrip('\n').split( + self.separator)))) + else: + self.selected_col_ids = [ + int(col_id) for col_id in selected_col_ids.split(',') + ] + if dtypes is None: + # default to str + self.dtypes = [str for col_id in self.selected_col_ids] + else: + self.dtypes = [eval(col_dtype) for col_dtype in dtypes.split(',')] + assert len(self.dtypes) == len(self.selected_col_ids) + + self.data_cnt = 0 + try: + self.slice_id = torch.distributed.get_rank() + self.slice_count = torch.distributed.get_world_size() + except Exception: + self.slice_id = 0 + self.slice_count = 1 + self.cached_index = cached_index + self._init_seek_index() + self._reader = self._get_reader() + print('file {} slice_id {} row count {} total row count {}'.format( + self.file_path, self.slice_id, self.row_count, + self.total_row_count)) + + def _init_seek_index(self): + if self.cached_index: + cache_path = '{}.index'.format(self.file_path) + assert os.path.exists( + cache_path), 'cache file {} not exists!'.format(cache_path) + self.total_row_count, self.lineid_to_offset = pickle.load( + open(cache_path, 'rb')) + print( + 'local datafile {} slice_id {} use cached row_count and line_idx-to-offset mapping' + .format(self.file_path, self.slice_id)) + else: + # make an iteration over the file to get row_count and line_idx-to-offset mapping + fp = open(self.file_path, 'r') + print( + 'local datafile {} slice_id {} begin to initialize row_count and line_idx-to-offset mapping' + .format(self.file_path, self.slice_id)) + self.total_row_count = 0 + offset = 0 + self.lineid_to_offset = [] + for line in fp: + self.lineid_to_offset.append(offset) + self.total_row_count += 1 + offset += len(line.encode('utf-8')) + self._compute_start_pos_and_row_count() + print( + 'local datafile {} slice_id {} finished initializing row_count and line_idx-to-offset mapping' + .format(self.file_path, self.slice_id)) + + def _compute_start_pos_and_row_count(self): + self.row_count = self.total_row_count // self.slice_count + if self.slice_id < self.total_row_count - self.row_count * self.slice_count: + self.row_count += 1 + self.start_pos = self.row_count * self.slice_id + else: + self.start_pos = self.row_count * self.slice_id + ( + self.total_row_count - self.row_count * self.slice_count) + + def _get_reader(self): + fp = open(self.file_path, 'r') + fp.seek(self.lineid_to_offset[self.start_pos]) + return fp + + def _seek(self, offset=0): + try: + print('slice_id {} seek offset {}'.format(self.slice_id, + self.start_pos + offset)) + self._reader.seek(self.lineid_to_offset[self.start_pos + offset]) + self.data_cnt = offset + except Exception: + print('slice_id {} seek offset {}'.format(self.slice_id, offset)) + self._reader.seek(self.lineid_to_offset[offset]) + self.data_cnt = offset + + def __del__(self): + self._reader.close() + + def __len__(self): + return self.row_count + + def get_total_row_count(self): + return self.total_row_count + + def __getitem__(self, index): + if self.data_cnt == self.row_count: + print('reach the end of datafile, start a new reader') + 
self.data_cnt = 0 + self._reader = self._get_reader() + column_l = self._reader.readline().rstrip('\n').split(self.separator) + self.data_cnt += 1 + column_l = [ + dtype(column_l[col_id]) + for col_id, dtype in zip(self.selected_col_ids, self.dtypes) + ] + return column_l diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py new file mode 100644 index 00000000..92a22bb4 --- /dev/null +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -0,0 +1,52 @@ +# Copyright 2022 The OFA-Sys Team. +# All rights reserved. +# This source code is licensed under the Apache 2.0 license +# found in the LICENSE file in the root directory. +from os import path as osp + +from torch.utils.data import Dataset + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.preprocessors.multi_modal import OfaPreprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks +from .ofa_file_dataset import OFAFileDataset + + +class OFADataset(Dataset): + + def __init__(self, + model_dir, + file_path, + dtypes=None, + separator='\t', + cached_index=False, + split=ModeKeys.TRAIN, + **kwargs): + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + selected_col_ids = self.cfg.dataset.selected_col_ids + selected_col_keys = self.cfg.dataset.selected_col_keys + + assert selected_col_ids is not None + assert selected_col_keys is not None + self.selected_col_key_l = selected_col_keys.split(',') + assert len(self.selected_col_key_l) == len(selected_col_ids.split(',')) + + self.dataset = OFAFileDataset( + file_path=file_path, + selected_col_ids=selected_col_ids, + dtypes=dtypes, + separator=separator, + cached_index=cached_index) + self.preprocessor = OfaPreprocessor(model_dir, split) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + value_l = self.dataset[index] + data = dict() + for key, value in zip(self.selected_col_key_l, value_l): + data[key] = value + return self.preprocessor(data) From a3aee4bec2318e38d36896228ac6c385537f68e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Thu, 1 Sep 2022 18:02:07 +0800 Subject: [PATCH 02/54] test finetune --- modelscope/metainfo.py | 1 + .../ofa/generate/sequence_generator.py | 25 +- .../models/multi_modal/ofa/modeling_ofa.py | 10 +- modelscope/preprocessors/multi_modal.py | 13 +- modelscope/preprocessors/ofa/base.py | 3 +- .../preprocessors/ofa/image_captioning.py | 11 +- .../preprocessors/ofa/image_classification.py | 12 +- modelscope/preprocessors/ofa/summarization.py | 12 +- .../preprocessors/ofa/text_classification.py | 12 +- .../ofa/text_to_image_synthesis.py | 12 +- .../preprocessors/ofa/visual_entailment.py | 12 +- .../preprocessors/ofa/visual_grounding.py | 12 +- .../ofa/visual_question_answering.py | 12 +- .../trainers/multi_modal/ofa/__init__.py | 1 + .../multi_modal/ofa/ofa_file_dataset.py | 2 + .../trainers/multi_modal/ofa/ofa_trainer.py | 120 ++++ .../multi_modal/ofa/ofa_trainer_utils.py | 302 +++++++- modelscope/utils/multi_modal/fp16/__init__.py | 14 + modelscope/utils/multi_modal/fp16/fp16.py | 655 ++++++++++++++++++ modelscope/utils/multi_modal/fp16/fp16util.py | 216 ++++++ .../utils/multi_modal/fp16/loss_scaler.py | 237 +++++++ 
tests/pipelines/test_ofa_tasks.py | 1 + tests/trainers/test_ofa_trainer.py | 20 + 23 files changed, 1661 insertions(+), 54 deletions(-) create mode 100644 modelscope/utils/multi_modal/fp16/__init__.py create mode 100755 modelscope/utils/multi_modal/fp16/fp16.py create mode 100644 modelscope/utils/multi_modal/fp16/fp16util.py create mode 100755 modelscope/utils/multi_modal/fp16/loss_scaler.py create mode 100644 tests/trainers/test_ofa_trainer.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 0bc16026..e344fbe7 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -164,6 +164,7 @@ class Trainers(object): # multi-modal trainers clip_multi_modal_embedding = 'clip-multi-modal-embedding' + ofa_tasks = 'ofa-tasks-trainer' # cv trainers image_instance_segmentation = 'image-instance-segmentation' diff --git a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py index 9d427836..15d19e2c 100644 --- a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py +++ b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py @@ -398,10 +398,27 @@ class SequenceGenerator(nn.Module): if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) - if self.repeat_ngram_blocker is not None and step > prefix_tokens.size( - 1): - lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, - beam_size, step) + if self.repeat_ngram_blocker is not None: + # process prefix_tokens + p_toks_len = prefix_tokens.ne(self.pad).sum( + dim=1) if prefix_tokens is not None else None + if p_toks_len is not None: + p_toks_len_beam = p_toks_len.unsqueeze(-1).repeat( + 1, beam_size).view(-1) + no_repeat_ngram_size = self.repeat_ngram_blocker.no_repeat_ngram_size + out_prefix = p_toks_len_beam < ( + step + no_repeat_ngram_size - 1) + else: + out_prefix = [True] * bsz * beam_size + ngram_blocker_tokens = tokens[out_prefix] + ngram_blocker_lprobs = lprobs[out_prefix] + ngram_blocker_bsz = out_prefix.sum() // beam_size + lprobs[out_prefix] = self.repeat_ngram_blocker( + tokens=ngram_blocker_tokens, + lprobs=ngram_blocker_lprobs, + bsz=ngram_blocker_bsz, + beam_size=beam_size, + step=step) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py index 4de35741..bc749b46 100755 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -19,6 +19,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple import torch +from packaging import version from torch import Tensor, nn from torch.nn import functional as F from transformers.activations import ACT2FN @@ -40,6 +41,8 @@ logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = 'ofa-base' _CONFIG_FOR_DOC = 'OFAConfig' _TOKENIZER_FOR_DOC = 'OFATokenizer' +TORCH_VERSION = version.parse(torch.__version__) +TORCH_MESH_GRID_WARNING_VERSION = version.parse('1.9.1') DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 @@ -114,8 +117,11 @@ def make_image_bucket_position(bucket_size, num_relative_distance): """ coords_h = torch.arange(bucket_size) coords_w = torch.arange(bucket_size) - coords = torch.stack(torch.meshgrid([coords_h, coords_w], - indexing='ij')) # 2, Wh, Ww + if TORCH_VERSION > TORCH_MESH_GRID_WARNING_VERSION: + coords = torch.stack( + torch.meshgrid([coords_h, coords_w], indexing='ij')) # 2, Wh, Ww + 
else: + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - \ coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 5046e166..7a7b5854 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -11,7 +11,7 @@ from modelscope.metainfo import Preprocessors from modelscope.pipelines.base import Input from modelscope.preprocessors.image import load_image from modelscope.utils.config import Config -from modelscope.utils.constant import Fields, ModelFile, Tasks +from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks from .base import Preprocessor from .builder import PREPROCESSORS from .ofa import * # noqa @@ -27,11 +27,16 @@ __all__ = [ Fields.multi_modal, module_name=Preprocessors.ofa_tasks_preprocessor) class OfaPreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: model_dir (str): model path + mode: preprocessor mode (model mode) """ super().__init__(*args, **kwargs) preprocess_mapping = { @@ -59,8 +64,8 @@ class OfaPreprocessor(Preprocessor): model_dir) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) - self.preprocess = preprocess_mapping[self.cfg.task](self.cfg, - model_dir) + self.preprocess = preprocess_mapping[self.cfg.task]( + cfg=self.cfg, model_dir=model_dir, mode=mode) self.keys = input_key_mapping[self.cfg.task] self.tokenizer = self.preprocess.tokenizer diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 69286f69..bb47c411 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -13,7 +13,7 @@ from .utils.random_help import set_torch_seed class OfaBasePreprocessor: - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, cfg, model_dir, mode, *args, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path Args: @@ -21,6 +21,7 @@ class OfaBasePreprocessor: model_dir (str): model path """ self.cfg = cfg + self.mode = mode self.language = self.cfg.model.get('language', 'en') if self.language == 'en': tokenizer = OFATokenizer.from_pretrained(model_dir) diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 3ea4ccb2..884e5ff8 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -12,16 +12,21 @@ from .base import OfaBasePreprocessor class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaImageCaptioningPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index 
a0cd0990..f4d5c08a 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -6,21 +6,27 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaImageClassificationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaImageClassificationPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index 00ae9bf9..9867954a 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -1,21 +1,27 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaSummarizationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaSummarizationPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: source = super().pre_caption( diff --git a/modelscope/preprocessors/ofa/text_classification.py b/modelscope/preprocessors/ofa/text_classification.py index 25981e65..06e35b78 100644 --- a/modelscope/preprocessors/ofa/text_classification.py +++ b/modelscope/preprocessors/ofa/text_classification.py @@ -1,21 +1,27 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaTextClassificationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaTextClassificationPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: text1 = ' '.join( diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py index 56198e67..ebedd6fc 100644 --- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py +++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py @@ -3,21 +3,27 @@ from typing import Any, Dict import torch +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaTextToImageSynthesisPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) self.max_src_length = 64 def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index 45c719b1..1cc5bc5c 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -6,21 +6,27 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaVisualEntailmentPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index eaaed0ef..43f80c7b 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -6,21 +6,27 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, 
+ mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaVisualGroundingPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index bce18c95..01c22537 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -6,21 +6,27 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir, split, *args, **kwargs): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path, - split: data phase + mode: preprocessor mode (model mode) """ super(OfaVisualQuestionAnsweringPreprocessor, - self).__init__(cfg, model_dir, split, *args, **kwargs) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), diff --git a/modelscope/trainers/multi_modal/ofa/__init__.py b/modelscope/trainers/multi_modal/ofa/__init__.py index e69de29b..7222c48c 100644 --- a/modelscope/trainers/multi_modal/ofa/__init__.py +++ b/modelscope/trainers/multi_modal/ofa/__init__.py @@ -0,0 +1 @@ +from .ofa_trainer import OFATrainer diff --git a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py index 2f64f9ff..17c9398a 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py @@ -78,6 +78,8 @@ class OFAFileDataset: self.lineid_to_offset.append(offset) self.total_row_count += 1 offset += len(line.encode('utf-8')) + pickle.dump(self.lineid_to_offset, + open('{}.index'.format(self.file_path), 'rb')) self._compute_start_pos_and_row_count() print( 'local datafile {} slice_id {} finished initializing row_count and line_idx-to-offset mapping' diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index e69de29b..af2fca0a 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -0,0 +1,120 @@ +import os +from os import path as osp +from typing import Dict, Optional + +import torch +import torch.distributed as dist +import transformers +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model +from modelscope.preprocessors.multi_modal import OfaPreprocessor +from modelscope.preprocessors.ofa.utils.collate import collate_fn +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import ModeKeys, ModelFile +from modelscope.utils.logger import 
get_logger +from modelscope.utils.torch_utils import init_dist +from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, + OFADataset, get_schedule) + +logger = get_logger() + + +@TRAINERS.register_module(module_name=Trainers.ofa_tasks) +class OFATrainer(BaseTrainer): + + def __init__(self, model: str, *args, **kwargs): + model = Model.from_pretrained(model) + super().__init__(osp.join(model.model_dir, ModelFile.CONFIGURATION)) + self.model_dir = model.model_dir + self.model = model.model + self.device_id = 0 + self.total_epoch = self.cfg.train.epoch + self.train_batch_size = self.cfg.train.batch_size + self.val_batch_size = self.cfg.evaluation.batch_size + self.save_dir = self.cfg.train.save_dir + init_dist(launcher='pytorch') + self.train_dataset = OFADataset( + file_path=self.cfg.dataset.train_set, + selected_id_keys=self.cfg.dataset.selected_id_keys, + preprocessor=OfaPreprocessor( + model_dir=self.model_dir, split=ModeKeys.TRAIN), + ) + self.val_dataset = OFADataset( + file_path=self.cfg.dataset.valid_set, + selected_id_keys=self.cfg.dataset.selected_id_keys, + preprocessor=OfaPreprocessor( + model_dir=self.model_dir, split=ModeKeys.EVAL), + ) + epoch_steps = len( + self.train_dataset) // self.cfg.train.gradient_accumulation_steps + self.cfg.train.num_train_steps = epoch_steps * self.cfg.train.epoch + self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( + self.cfg.train.criterion) + + def train(self, *args, **kwargs): + assert dist.is_initialized() + + self.model.train() + self.model.to(self.device_id) + ddp_model = torch.nn.parallel.DistributedDataParallel( + self.model, device_ids=[ + self.device_id, + ]) + + optimizer = transformers.AdamW( + self.model.parameters(), + lr=self.cfg.train.lr, + weight_decay=self.cfg.train.weight_decay, + correct_bias=False, + ) + scheduler_class, scheduler_args = get_schedule(self.cfg.train) + if scheduler_class is not None: + lr_scheduler = scheduler_class(**{'optimizer': optimizer}, + **scheduler_args) + else: + lr_scheduler = None + for epoch in range(self.total_epoch): + train_sampler = DistributedSampler( + dataset=self.train_dataset, shuffle=True) + train_sampler.set_epoch(epoch) + + train_params = { + 'pin_memory': True, + 'collate_fn': collate_fn, + 'batch_size': self.train_batch_size, + 'shuffle': False, + 'drop_last': True, + 'sampler': train_sampler, + 'num_workers': 2, + } + + train_loader = DataLoader(self.train_dataset, **train_params) + + for idx, batch in enumerate(train_loader, start=1): + model_outputs = ddp_model(**batch) + loss, sample_size, logging_output = self.criterion( + model_outputs, batch) + loss.backward() + optimizer.zero_grad() + if lr_scheduler is not None: + lr_scheduler.step() + optimizer.step() + optimizer.zero_grad() + if idx % 10 == 0: + logger.info( + 'epoch: {}, train batch {}/{}, loss={:.5f}'.format( + epoch, idx, len(train_loader), loss.item())) + if dist.get_rank() == 0: + os.makedirs(self.ckpt_dir, exist_ok=True) + torch.save(ddp_model.module.state_dict(), + f'{self.ckpt_dir}/epoch{epoch}.bin') + + def evaluate(self, + checkpoint_path: Optional[str] = None, + *args, + **kwargs) -> Dict[str, float]: + pass diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 92a22bb4..10acc870 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -2,36 +2,36 @@ # All rights reserved. 
# This source code is licensed under the Apache 2.0 license # found in the LICENSE file in the root directory. -from os import path as osp +import math +import numpy as np +import torch +import torch.nn.functional as F +import transformers +from torch.nn.modules.loss import _Loss from torch.utils.data import Dataset -from modelscope.hub.snapshot_download import snapshot_download from modelscope.preprocessors.multi_modal import OfaPreprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks from .ofa_file_dataset import OFAFileDataset class OFADataset(Dataset): def __init__(self, - model_dir, - file_path, + file_path: str, + preprocessor: OfaPreprocessor, + selected_id_keys: str, dtypes=None, separator='\t', cached_index=False, - split=ModeKeys.TRAIN, **kwargs): - self.cfg = Config.from_file( - osp.join(model_dir, ModelFile.CONFIGURATION)) - selected_col_ids = self.cfg.dataset.selected_col_ids - selected_col_keys = self.cfg.dataset.selected_col_keys - - assert selected_col_ids is not None - assert selected_col_keys is not None - self.selected_col_key_l = selected_col_keys.split(',') - assert len(self.selected_col_key_l) == len(selected_col_ids.split(',')) + assert selected_id_keys is not None + selected_col_ids = list() + selected_col_keys = list() + for id_key in selected_id_keys.split(','): + id, key = id_key.split(':') + selected_col_ids.append(id) + selected_col_keys.append(key) self.dataset = OFAFileDataset( file_path=file_path, @@ -39,14 +39,278 @@ class OFADataset(Dataset): dtypes=dtypes, separator=separator, cached_index=cached_index) - self.preprocessor = OfaPreprocessor(model_dir, split) + self.preprocessor = preprocessor def __len__(self): return len(self.dataset) def __getitem__(self, index): - value_l = self.dataset[index] + values = self.dataset[index] data = dict() - for key, value in zip(self.selected_col_key_l, value_l): + for key, value in zip(self.selected_col_keys, values): data[key] = value return self.preprocessor(data) + + +def construct_rdrop_sample(x): + if isinstance(x, dict): + for key in x: + x[key] = construct_rdrop_sample(x[key]) + return x + elif isinstance(x, torch.Tensor): + return x.repeat(2, *([1] * (x.dim() - 1))) + elif isinstance(x, int): + return x * 2 + elif isinstance(x, np.ndarray): + return x.repeat(2) + else: + raise NotImplementedError + + +def kl_loss(p, q): + p_loss = F.kl_div(p, torch.exp(q), reduction='sum') + q_loss = F.kl_div(q, torch.exp(p), reduction='sum') + loss = (p_loss + q_loss) / 2 + return loss + + +def label_smoothed_nll_loss(lprobs, + target, + epsilon, + update_num, + reduce=True, + drop_worst_ratio=0.0, + drop_worst_after=0, + use_rdrop=False, + reg_alpha=1.0, + constraint_masks=None, + constraint_start=None, + constraint_end=None): + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + nll_loss = -lprobs.gather(dim=-1, index=target).squeeze(-1) + if constraint_masks is not None: + smooth_loss = -lprobs.masked_fill(~constraint_masks, 0).sum( + dim=-1, keepdim=True).squeeze(-1) + eps_i = epsilon / (constraint_masks.sum(1) - 1 + 1e-6) + elif constraint_start is not None and constraint_end is not None: + constraint_range = [0, 1, 2, 3] + list( + range(constraint_start, constraint_end)) + smooth_loss = -lprobs[:, constraint_range].sum( + dim=-1, keepdim=True).squeeze(-1) + eps_i = epsilon / (len(constraint_range) - 1 + 1e-6) + else: + smooth_loss = -lprobs.sum(dim=-1, keepdim=True).squeeze(-1) + eps_i = epsilon / (lprobs.size(-1) - 1) + loss = 
(1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss + if drop_worst_ratio > 0 and update_num > drop_worst_after: + if use_rdrop: + true_batch_size = loss.size(0) // 2 + _, indices = torch.topk( + loss[:true_batch_size], + k=int(true_batch_size * (1 - drop_worst_ratio)), + largest=False) + loss = torch.cat([loss[indices], loss[indices + true_batch_size]]) + nll_loss = torch.cat( + [nll_loss[indices], nll_loss[indices + true_batch_size]]) + lprobs = torch.cat( + [lprobs[indices], lprobs[indices + true_batch_size]]) + else: + loss, indices = torch.topk( + loss, + k=int(loss.shape[0] * (1 - drop_worst_ratio)), + largest=False) + nll_loss = nll_loss[indices] + lprobs = lprobs[indices] + + ntokens = loss.numel() + nll_loss = nll_loss.sum() + loss = loss.sum() + if use_rdrop: + true_batch_size = lprobs.size(0) // 2 + p = lprobs[:true_batch_size] + q = lprobs[true_batch_size:] + if constraint_start is not None and constraint_end is not None: + constraint_range = [0, 1, 2, 3] + list( + range(constraint_start, constraint_end)) + p = p[:, constraint_range] + q = q[:, constraint_range] + loss += kl_loss(p, q) * reg_alpha + + return loss, nll_loss, ntokens + + +class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): + + def __init__(self, args): + super().__init__() + self.sentence_avg = args.sentence_avg + self.eps = args.label_smoothing + self.ignore_prefix_size = args.ignore_prefix_size + self.ignore_eos = args.ignore_eos + self.report_accuracy = args.report_accuracy + self.drop_worst_ratio = args.drop_worst_ratio + self.drop_worst_after = args.drop_worst_after + self.use_rdrop = args.use_rdrop + self.reg_alpha = args.reg_alpha + self.sample_patch_num = args.sample_patch_num + + self.constraint_start = None + self.constraint_end = None + if args.constraint_range is not None: + constraint_start, constraint_end = args.constraint_range.split(',') + self.constraint_start = int(constraint_start) + self.constraint_end = int(constraint_end) + self.padding_idx = args.tokenizer.pad_token_id + self.args = args + + def forward(self, output, sample, update_num=0, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + if isinstance(sample, list): + if self.sample_patch_num > 0: + sample[0]['net_input'][ + 'sample_patch_num'] = self.sample_patch_num + loss_v1, sample_size_v1, logging_output_v1 = self.forward( + output[0], sample[0], update_num, reduce) + loss_v2, sample_size_v2, logging_output_v2 = self.forward( + output[1], sample[1], update_num, reduce) + loss = loss_v1 / sample_size_v1 + loss_v2 / sample_size_v2 + sample_size = 1 + logging_output = { + 'loss': + loss.data, + 'loss_v1': + loss_v1.data, + 'loss_v2': + loss_v2.data, + 'nll_loss': + logging_output_v1['nll_loss'].data / sample_size_v1 + + logging_output_v2['nll_loss'].data / sample_size_v2, + 'ntokens': + logging_output_v1['ntokens'] + logging_output_v2['ntokens'], + 'nsentences': + logging_output_v1['nsentences'] + + logging_output_v2['nsentences'], + 'sample_size': + 1, + 'sample_size_v1': + sample_size_v1, + 'sample_size_v2': + sample_size_v2, + } + return loss, sample_size, logging_output + + if self.use_rdrop: + construct_rdrop_sample(sample) + + net_output = output + # model(**sample["net_input"]) + loss, nll_loss, ntokens = self.compute_loss( + net_output, sample, update_num, reduce=reduce) + sample_size = ( + sample['target'].size(0) if self.sentence_avg else ntokens) + logging_output = { + 'loss': loss.data, + 'nll_loss': nll_loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample['nsentences'], + 'sample_size': sample_size, + } + return loss, sample_size, logging_output + + def get_lprobs_and_target(self, net_output, sample): + conf = sample['conf'][:, None, None] if 'conf' in sample and sample[ + 'conf'] is not None else 1 + constraint_masks = None + if 'constraint_masks' in sample and sample[ + 'constraint_masks'] is not None: + constraint_masks = sample['constraint_masks'] + net_output[0].masked_fill_(~constraint_masks, -math.inf) + if self.constraint_start is not None and self.constraint_end is not None: + net_output[0][:, :, 4:self.constraint_start] = -math.inf + net_output[0][:, :, self.constraint_end:] = -math.inf + lprobs = F.log_softmax( + net_output[0], dim=-1, dtype=torch.float32) * conf + target = sample['target'] + if self.ignore_prefix_size > 0: + lprobs = lprobs[:, self.ignore_prefix_size:, :].contiguous() + target = target[:, self.ignore_prefix_size:].contiguous() + if constraint_masks is not None: + constraint_masks = constraint_masks[:, self.ignore_prefix_size:, :].contiguous() # yapf: disable + if self.ignore_eos: + bsz, seq_len, embed_dim = lprobs.size() + eos_indices = target.eq(self.task.tgt_dict.eos()) + lprobs = lprobs[~eos_indices].reshape(bsz, seq_len - 1, embed_dim) + target = target[~eos_indices].reshape(bsz, seq_len - 1) + if constraint_masks is not None: + constraint_masks = constraint_masks[~eos_indices].reshape( + bsz, seq_len - 1, embed_dim) + if constraint_masks is not None: + constraint_masks = constraint_masks.view(-1, + constraint_masks.size(-1)) + return lprobs.view(-1, + lprobs.size(-1)), target.view(-1), constraint_masks + + def compute_loss(self, net_output, sample, update_num, reduce=True): + lprobs, target, constraint_masks = self.get_lprobs_and_target( + net_output, sample) + if constraint_masks is not None: + constraint_masks = constraint_masks[target != self.padding_idx] + lprobs = lprobs[target != self.padding_idx] + target = target[target != self.padding_idx] + loss, nll_loss, ntokens = 
label_smoothed_nll_loss( + lprobs, + target, + self.eps, + update_num, + reduce=reduce, + drop_worst_ratio=self.drop_worst_ratio, + drop_worst_after=self.drop_worst_after, + use_rdrop=self.use_rdrop, + reg_alpha=self.reg_alpha, + constraint_masks=constraint_masks, + constraint_start=self.constraint_start, + constraint_end=self.constraint_end) + return loss, nll_loss, ntokens + + +def get_schedule(args): + + if args.schedule == 'const': + scheduler_class = transformers.get_constant_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(args.warmup_proportion * args.num_train_steps) + } + elif args.schedule == 'linear': + scheduler_class = transformers.get_linear_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(args.warmup_proportion * args.num_train_steps), + 'num_training_steps': args.num_train_steps + } + elif args.schedule == 'cosine': + scheduler_class = transformers.get_cosine_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(args.warmup_proportion * args.num_train_steps), + 'num_training_steps': args.num_train_steps + } + elif args.schedule == 'polynomial_decay': + scheduler_class = transformers.get_polynomial_decay_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(args.warmup_proportion * args.num_train_steps), + 'num_training_steps': args.num_train_steps, + 'lr_end': args.lr_end + } + else: + raise NotImplementedError + + return scheduler_class, scheduler_args diff --git a/modelscope/utils/multi_modal/fp16/__init__.py b/modelscope/utils/multi_modal/fp16/__init__.py new file mode 100644 index 00000000..81250858 --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fp16 import FP16_Module, FP16_Optimizer diff --git a/modelscope/utils/multi_modal/fp16/fp16.py b/modelscope/utils/multi_modal/fp16/fp16.py new file mode 100755 index 00000000..37a80e65 --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/fp16.py @@ -0,0 +1,655 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from .fp16util import (master_params_to_model_params, + model_grads_to_master_grads) +from .loss_scaler import DynamicLossScaler, LossScaler + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + + return conversion_helper(val, half_conversion) + + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class FP16_Module(nn.Module): + + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. # noqa + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. 
Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. # noqa + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. # noqa + dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. # noqa + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. # noqa + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. This means that adjusting the loss scale, or using + dynamic loss scaling, should not require retuning the learning rate or any other + hyperparameters. + + + **Advanced options** + + **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. + See docstring for :attr:`step`. + + **Gradient clipping**: Use :attr:`clip_master_grads`. + + **Multiple losses**: If your model accumulates gradients from multiple losses, + this can be made more efficient by supplying ``update_master_grads=False`` + to :attr:`backward`. See docstring for :attr:`backward`. + + **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: + + print(optimizer.loss_scale) + optimizer.loss_scale = new_loss_scale + + For static loss scaling, manually adjusting the loss scale over time is a reasonable + thing to do. During later epochs, gradients may become smaller, and a + higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss + scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting + the loss scale is not recommended. 
+
+    **Multi-GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
+    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
+    should still work as intended.
+    """
+
+    def __init__(self,
+                 init_optimizer,
+                 static_loss_scale=1.0,
+                 dynamic_loss_scale=False,
+                 dynamic_loss_args=None,
+                 verbose=False):
+        if not torch.cuda.is_available():
+            raise SystemError('Cannot use fp16 without CUDA.')
+
+        self.verbose = verbose
+
+        self.optimizer = init_optimizer
+        # init_state_dict sets up an alternative way to cast per-param state tensors.
+        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
+        # init_state_dict = init_optimizer.state_dict()
+
+        self.fp16_groups = []
+        self.fp32_from_fp16_groups = []
+        self.fp32_from_fp32_groups = []
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            self.maybe_print(
+                'FP16_Optimizer processing param group {}:'.format(i))
+            fp16_params_this_group = []
+            fp32_params_this_group = []
+            fp32_from_fp16_params_this_group = []
+            for i, param in enumerate(param_group['params']):
+                if param.requires_grad:
+                    if param.type() == 'torch.cuda.HalfTensor':
+                        self.maybe_print(
+                            'FP16_Optimizer received torch.cuda.HalfTensor with {}'
+                            .format(param.size()))
+                        fp16_params_this_group.append(param)
+                        master_param = param.detach().clone().float()
+                        master_param.requires_grad = True
+                        # Copy the model parallel flag.
+                        master_param.model_parallel = param.model_parallel
+                        param_group['params'][i] = master_param
+                        fp32_from_fp16_params_this_group.append(master_param)
+                        # Reset existing state dict key to the new master param.
+                        # We still need to recast per-param state tensors, if any, to FP32.
+                        if param in self.optimizer.state:
+                            self.optimizer.state[
+                                master_param] = self.optimizer.state.pop(param)
+                    elif param.type() == 'torch.cuda.FloatTensor':
+                        self.maybe_print(
+                            'FP16_Optimizer received torch.cuda.FloatTensor with {}'
+                            .format(param.size()))
+                        fp32_params_this_group.append(param)
+                        param_group['params'][i] = param
+                    else:
+                        raise TypeError(
+                            'Wrapped parameters must be either '
+                            'torch.cuda.FloatTensor or torch.cuda.HalfTensor. '
+                            'Received {}'.format(param.type()))
+
+            self.fp16_groups.append(fp16_params_this_group)
+            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+            self.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
+        self.optimizer.load_state_dict(self.optimizer.state_dict())
+        # alternative way to cast per-param state tensors:
+        # self.optimizer.load_state_dict(init_state_dict)
+
+        if dynamic_loss_scale:
+            self.dynamic_loss_scale = True
+            if dynamic_loss_args is not None:
+                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
+            else:
+                self.loss_scaler = DynamicLossScaler()
+        else:
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(static_loss_scale)
+
+        self.overflow = False
+        self.first_closure_call_this_step = True
+
+        self.clip_grad_norm = nn.utils.clip_grad.clip_grad_norm_
+
+    def maybe_print(self, msg):
+        if self.verbose:
+            print(msg)
+
+    def __getstate__(self):
+        raise RuntimeError(
+            'FP16_Optimizer should be serialized using state_dict().')
+
+    def __setstate__(self, state):
+        raise RuntimeError(
+            'FP16_Optimizer should be deserialized using load_state_dict().')
+
+    def zero_grad(self, set_grads_to_None=False):
+        """
+        Zero fp32 and fp16 parameter grads.
+ """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_( + ) # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1. / self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. + + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). # noqa + """ + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict[ + 'first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict[ + 'first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + for current_group, saved_group in zip(self.fp32_from_fp16_groups, + state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
+ However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. # noqa + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print( + 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. + # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while (self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print( + 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' + 'reducing to {}'.format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). # noqa + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. # noqa + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. # noqa + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). # noqa + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. + optimizer.backward(loss1) + optimizer.backward(loss2) + + # More efficient way to handle multiple losses: + # The fp16->fp32 grad copy is delayed until fp16 grads from all + # losses have been accumulated. + optimizer.backward(loss1, update_master_grads=False) + optimizer.backward(loss2, update_master_grads=False) + optimizer.update_master_grads() + """ + # To consider: try multiple backward passes using retain_grad=True to find + # a loss scale that works. After you find a loss scale that works, do a final dummy + # backward pass with retain_graph=False to tear down the graph. Doing this would avoid + # discarding the iteration, but probably wouldn't improve overall efficiency. + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + if update_master_grads: + self.update_master_grads() + + def update_master_grads(self): + """ + Copy the ``.grad`` attribute from stored references to fp16 parameters to + the ``.grad`` attribute of the fp32 master parameters that are directly + updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if + ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. + """ + if self.dynamic_loss_scale: + self._check_overflow() + if self.overflow: return # noqa + self._model_grads_to_master_grads() + self._downscale_master() + + def inspect_master_grad_data(self): + """ + When running with :class:`FP16_Optimizer`, + ``.grad`` attributes of a model's fp16 leaves should not be + regarded as truthful, because they might be scaled. + After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, + the fp32 master params' ``.grad`` + attributes will contain valid gradients properly divided by the loss scale. However, + because :class:`FP16_Optimizer` flattens some parameters, accessing them may be + nonintuitive. :attr:`inspect_master_grad_data` + allows those gradients to be viewed with shapes corresponding to their associated model leaves. + + Returns: + List of lists (one list for each parameter group). The list for each parameter group + is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. + """ + if self.overflow: + print( + 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' + 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' + ) + return None + else: + # The optimizer owns only references to master params. + master_grads_data = [] + for param_group in self.optimizer.param_groups: + master_grads_this_group = [] + for param in param_group['params']: + if param.grad is not None: + master_grads_this_group.append(param.grad.data) + else: + master_grads_this_group.append(None) + master_grads_data.append(master_grads_this_group) + return master_grads_data + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/utils/multi_modal/fp16/fp16util.py b/modelscope/utils/multi_modal/fp16/fp16util.py new file mode 100644 index 00000000..29595a6c --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/fp16util.py @@ -0,0 +1,216 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
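Taken together, `FP16_Module` and `FP16_Optimizer` above are meant to be near drop-in wrappers around an existing model and optimizer. A minimal single-GPU training sketch, assuming a CUDA device, a toy `nn.Linear` model, and arbitrary example hyperparameters; note that this particular copy of `FP16_Optimizer` copies a `model_parallel` flag from each fp16 parameter, so the flag is set explicitly here::

    import torch

    from modelscope.utils.multi_modal.fp16 import FP16_Module, FP16_Optimizer

    # Toy fp16 model; FP16_Module halves the weights and casts inputs/outputs for us.
    model = FP16_Module(torch.nn.Linear(1024, 10)).cuda()
    for p in model.parameters():
        p.model_parallel = False  # this copy of FP16_Optimizer expects the flag to exist

    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
    # or: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    criterion = torch.nn.CrossEntropyLoss()
    inputs = torch.randn(8, 1024).cuda()
    targets = torch.randint(0, 10, (8,)).cuda()

    for _ in range(10):
        optimizer.zero_grad()
        logits = model(inputs)            # outputs come back as fp32
        loss = criterion(logits, targets)
        optimizer.backward(loss)          # replaces loss.backward(); applies the loss scale
        optimizer.clip_master_grads(1.0)  # optional: clip the fp32 master gradients
        optimizer.step()                  # with dynamic scaling, skipped on overflow

With ``dynamic_loss_scale=True`` the first iterations may be skipped while `DynamicLossScaler` searches for a workable scale, which is the expected behaviour described in `loss_scaler.py` below.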
+ +import torch +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable + + +class tofp16(nn.Module): + """ + Utility module that implements:: + + def forward(self, input): + return input.half() + """ + + def __init__(self): + super(tofp16, self).__init__() + + def forward(self, input): + return input.half() + + +def BN_convert_float(module): + """ + Utility function for network_to_half(). + + Retained for legacy purposes. + """ + if isinstance( + module, + torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + module.float() + for child in module.children(): + BN_convert_float(child) + return module + + +def network_to_half(network): + """ + Convert model to half precision in a batchnorm-safe way. + + Retained for legacy purposes. It is recommended to use FP16Model. + """ + return nn.Sequential(tofp16(), BN_convert_float(network.half())) + + +def convert_module(module, dtype): + """ + Converts a module's immediate parameters and buffers to dtype. + """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm + ) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError( + 'master_params recieved a gradient in the backward pass!') + + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. # noqa + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. # noqa + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. # noqa + + .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ + model_params = [ + param for param in model.parameters() if param.requires_grad + ] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors( + [param.data for param in model_params]).float() + except: # noqa + print( + 'Error in prep_param_lists: model may contain a mixture of parameters ' + 'of different types. Use flat_master=False, or use F16_Optimizer.' + ) + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [ + param.clone().float().detach() for param in model_params + ] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, + master_params, + flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. # noqa + """ + if flat_master: + # The flattening may incur one more deep copy than is necessary. + master_params[0].grad.data.copy_( + _flatten_dense_tensors([p.grad.data for p in model_params])) + else: + for model, master in zip(model_params, master_params): + if model.grad is not None: + if master.grad is None: + master.grad = Variable( + master.data.new(*master.data.size())) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master_params_to_model_params(model_params, + master_params, + flat_master=False): + """ + Copy master parameters to model parameters. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. # noqa + """ + if flat_master: + for model, master in zip( + model_params, + _unflatten_dense_tensors(master_params[0].data, model_params)): + model.data.copy_(master) + else: + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + + +# Backward compatibility fixes + + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) diff --git a/modelscope/utils/multi_modal/fp16/loss_scaler.py b/modelscope/utils/multi_modal/fp16/loss_scaler.py new file mode 100755 index 00000000..fc55a4ed --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/loss_scaler.py @@ -0,0 +1,237 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. + """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. # noqa + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
# noqa + """ + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan( + p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + overflow_gpu = torch.cuda.ByteTensor([overflow]) + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float( + 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale / self.scale_factor, + self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter + - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. +if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. 
+ N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + +""" diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index ab10f573..8ee5f2ef 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -172,6 +172,7 @@ class OfaTasksTest(unittest.TestCase): ofa_pipe = pipeline(Tasks.visual_grounding, model=model) image = 'data/test/images/visual_grounding.png' text = '一个圆头的蓝色宝可梦' + text = '火' input = {'image': image, 'text': text} result = ofa_pipe(input) print(result) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py new file mode 100644 index 00000000..bfec1b85 --- /dev/null +++ b/tests/trainers/test_ofa_trainer.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import shutil +import unittest + +from modelscope.trainers.multi_modal.ofa import OFATrainer +from modelscope.utils.test_utils import test_level + + +class TestOfaTrainer(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer(self): + model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_text-classification_mnli_large_en' + self.trainer = OFATrainer(model_id) + self.trainer.train() + shutil.rmtree(self.trainer.save_dir) + + +if __name__ == '__main__': + unittest.main() From db534fe946697cffdfb80472492831f8cd18b7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Sun, 4 Sep 2022 16:08:25 +0800 Subject: [PATCH 03/54] add hf dataset --- .../models/multi_modal/ofa_for_all_tasks.py | 6 +- modelscope/preprocessors/ofa/base.py | 3 +- .../multi_modal/ofa/ofa_file_dataset.py | 2 +- .../trainers/multi_modal/ofa/ofa_trainer.py | 140 +++++++----------- .../multi_modal/ofa/ofa_trainer_old.py | 120 +++++++++++++++ .../multi_modal/ofa/ofa_trainer_utils.py | 34 +++-- tests/trainers/test_ofa_trainer.py | 4 +- 7 files changed, 201 insertions(+), 108 deletions(-) create mode 100644 modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 80471e3c..4528a9da 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -287,5 +287,7 @@ class OfaForAllTasks(TorchModel): def load_ans2label(self): if self.cfg.model.get('answer2label', None): - filename = osp.join(self.model_dir, self.cfg.model.answer2label) - self.ans2label_dict = json.load(open(filename)) + ans2label_file = osp.join(self.model_dir, + self.cfg.model.answer2label) + with open(ans2label_file, 'r') as reader: + self.ans2label_dict = json.load(reader) diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index bb47c411..8bbe02d1 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -61,7 +61,8 @@ class OfaBasePreprocessor: self.index2ans = {} if self.cfg.model.get('answer2label', False): ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) - ans2label_dict = json.load(open(ans2label_file, 'r')) + with open(ans2label_file, 'r') as reader: + ans2label_dict = json.load(reader) self.constraint_trie = Trie(tokenizer.eos_token_id) for i, answer in enumerate(ans2label_dict.keys()): answer_item = tokenizer( diff --git a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py index 17c9398a..138f1303 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py @@ -79,7 +79,7 @@ class OFAFileDataset: self.total_row_count += 1 offset += len(line.encode('utf-8')) pickle.dump(self.lineid_to_offset, - open('{}.index'.format(self.file_path), 'rb')) + open('{}.index'.format(self.file_path), 'wb')) self._compute_start_pos_and_row_count() print( 'local datafile {} slice_id {} finished initializing row_count and line_idx-to-offset mapping' diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index af2fca0a..fae79a74 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -1,120 +1,84 @@ import os -from os import path as osp from typing import Dict, Optional 
-import torch
-import torch.distributed as dist
-import transformers
-from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
+from datasets import load_dataset
 
 from modelscope.metainfo import Trainers
 from modelscope.models.base import Model
+from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.preprocessors.multi_modal import OfaPreprocessor
 from modelscope.preprocessors.ofa.utils.collate import collate_fn
-from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers import EpochBasedTrainer
 from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.optimizer.builder import build_optimizer
+from modelscope.utils.config import Config
 from modelscope.utils.constant import ModeKeys, ModelFile
-from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import init_dist
 from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion,
                                 OFADataset, get_schedule)
 
-logger = get_logger()
-
 
 @TRAINERS.register_module(module_name=Trainers.ofa_tasks)
-class OFATrainer(BaseTrainer):
+class OFATrainer(EpochBasedTrainer):
 
     def __init__(self, model: str, *args, **kwargs):
         model = Model.from_pretrained(model)
-        super().__init__(osp.join(model.model_dir, ModelFile.CONFIGURATION))
-        self.model_dir = model.model_dir
-        self.model = model.model
-        self.device_id = 0
-        self.total_epoch = self.cfg.train.epoch
-        self.train_batch_size = self.cfg.train.batch_size
-        self.val_batch_size = self.cfg.evaluation.batch_size
-        self.save_dir = self.cfg.train.save_dir
-        init_dist(launcher='pytorch')
-        self.train_dataset = OFADataset(
-            file_path=self.cfg.dataset.train_set,
-            selected_id_keys=self.cfg.dataset.selected_id_keys,
-            preprocessor=OfaPreprocessor(
-                model_dir=self.model_dir, split=ModeKeys.TRAIN),
-        )
-        self.val_dataset = OFADataset(
-            file_path=self.cfg.dataset.valid_set,
-            selected_id_keys=self.cfg.dataset.selected_id_keys,
-            preprocessor=OfaPreprocessor(
-                model_dir=self.model_dir, split=ModeKeys.EVAL),
+        model_dir = model.model_dir
+        cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
+        cfg = Config.from_file(cfg_file)
+        dataset = load_dataset(
+            cfg.dataset.script,
+            data_files=cfg.dataset.hf_dataset,
+            sep=cfg.dataset.sep,
         )
-        epoch_steps = len(
-            self.train_dataset) // self.cfg.train.gradient_accumulation_steps
-        self.cfg.train.num_train_steps = epoch_steps * self.cfg.train.epoch
+        ms_dataset = MsDataset.from_hf_dataset(dataset)
+        # train_dataset = OFADataset(
+        #     file_path=cfg.dataset.train_set,
+        #     selected_id_keys=cfg.dataset.selected_id_keys,
+        #     preprocessor=OfaPreprocessor(
+        #         model_dir=model_dir, mode=ModeKeys.TRAIN),
+        # )
+        # val_dataset = OFADataset(
+        #     file_path=cfg.dataset.valid_set,
+        #     selected_id_keys=cfg.dataset.selected_id_keys,
+        #     preprocessor=OfaPreprocessor(
+        #         model_dir=model_dir, mode=ModeKeys.EVAL),
+        # )
+        epoch_steps = len(ms_dataset['train']) // (
+            cfg.train.gradient_accumulation_steps
+            * cfg.train.dataloader.batch_size_per_gpu)
+        cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs
+        cfg.train.criterion.tokenizer = model.tokenizer
         self.criterion = AdjustLabelSmoothedCrossEntropyCriterion(
-            self.cfg.train.criterion)
-
-    def train(self, *args, **kwargs):
-        assert dist.is_initialized()
-
-        self.model.train()
-        self.model.to(self.device_id)
-        ddp_model = torch.nn.parallel.DistributedDataParallel(
-            self.model, device_ids=[
-                self.device_id,
-            ])
-
-        optimizer = transformers.AdamW(
self.model.parameters(), - lr=self.cfg.train.lr, - weight_decay=self.cfg.train.weight_decay, - correct_bias=False, - ) - scheduler_class, scheduler_args = get_schedule(self.cfg.train) + cfg.train.criterion) + optimizer = build_optimizer(model, cfg=cfg.train.optimizer) + scheduler_class, scheduler_args = get_schedule(cfg.train.lr_scheduler) if scheduler_class is not None: lr_scheduler = scheduler_class(**{'optimizer': optimizer}, **scheduler_args) else: lr_scheduler = None - for epoch in range(self.total_epoch): - train_sampler = DistributedSampler( - dataset=self.train_dataset, shuffle=True) - train_sampler.set_epoch(epoch) - - train_params = { - 'pin_memory': True, - 'collate_fn': collate_fn, - 'batch_size': self.train_batch_size, - 'shuffle': False, - 'drop_last': True, - 'sampler': train_sampler, - 'num_workers': 2, - } - - train_loader = DataLoader(self.train_dataset, **train_params) + super().__init__( + cfg_file=cfg_file, + model=model, + data_collator=collate_fn, + train_dataset=dataset['train'], + eval_dataset=dataset['valid'], + optimizers=(optimizer, lr_scheduler), + work_dir=cfg.train.work_dir, + *args, + **kwargs, + ) - for idx, batch in enumerate(train_loader, start=1): - model_outputs = ddp_model(**batch) - loss, sample_size, logging_output = self.criterion( - model_outputs, batch) - loss.backward() - optimizer.zero_grad() - if lr_scheduler is not None: - lr_scheduler.step() - optimizer.step() - optimizer.zero_grad() - if idx % 10 == 0: - logger.info( - 'epoch: {}, train batch {}/{}, loss={:.5f}'.format( - epoch, idx, len(train_loader), loss.item())) - if dist.get_rank() == 0: - os.makedirs(self.ckpt_dir, exist_ok=True) - torch.save(ddp_model.module.state_dict(), - f'{self.ckpt_dir}/epoch{epoch}.bin') + def train(self, *args, **kwargs): + pass def evaluate(self, checkpoint_path: Optional[str] = None, *args, **kwargs) -> Dict[str, float]: pass + + def prediction_step(self, model, inputs): + pass diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py new file mode 100644 index 00000000..5e41b49b --- /dev/null +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py @@ -0,0 +1,120 @@ +import os +from os import path as osp +from typing import Dict, Optional + +import torch +import torch.distributed as dist +import transformers +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model +from modelscope.preprocessors.multi_modal import OfaPreprocessor +from modelscope.preprocessors.ofa.utils.collate import collate_fn +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import ModeKeys, ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import init_dist +from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, + OFADataset, get_schedule) + +logger = get_logger() + + +@TRAINERS.register_module(module_name=Trainers.ofa_tasks) +class OFAOldTrainer(BaseTrainer): + + def __init__(self, model: str, *args, **kwargs): + model = Model.from_pretrained(model) + super().__init__(osp.join(model.model_dir, ModelFile.CONFIGURATION)) + self.model_dir = model.model_dir + self.model = model.model + self.device_id = 0 + self.total_epoch = self.cfg.train.epoch + self.train_batch_size = self.cfg.train.batch_size + self.val_batch_size = self.cfg.evaluation.batch_size + 
self.save_dir = self.cfg.train.save_dir + init_dist(launcher='pytorch') + self.train_dataset = OFADataset( + file_path=self.cfg.dataset.train_set, + selected_id_keys=self.cfg.dataset.selected_id_keys, + preprocessor=OfaPreprocessor( + model_dir=self.model_dir, split=ModeKeys.TRAIN), + ) + self.val_dataset = OFADataset( + file_path=self.cfg.dataset.valid_set, + selected_id_keys=self.cfg.dataset.selected_id_keys, + preprocessor=OfaPreprocessor( + model_dir=self.model_dir, split=ModeKeys.EVAL), + ) + epoch_steps = len( + self.train_dataset) // self.cfg.train.gradient_accumulation_steps + self.cfg.train.num_train_steps = epoch_steps * self.cfg.train.epoch + self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( + self.cfg.train.criterion) + + def train(self, *args, **kwargs): + assert dist.is_initialized() + + self.model.train() + self.model.to(self.device_id) + ddp_model = torch.nn.parallel.DistributedDataParallel( + self.model, device_ids=[ + self.device_id, + ]) + + optimizer = transformers.AdamW( + self.model.parameters(), + lr=self.cfg.train.lr, + weight_decay=self.cfg.train.weight_decay, + correct_bias=False, + ) + scheduler_class, scheduler_args = get_schedule(self.cfg.train) + if scheduler_class is not None: + lr_scheduler = scheduler_class(**{'optimizer': optimizer}, + **scheduler_args) + else: + lr_scheduler = None + for epoch in range(self.total_epoch): + train_sampler = DistributedSampler( + dataset=self.train_dataset, shuffle=True) + train_sampler.set_epoch(epoch) + + train_params = { + 'pin_memory': True, + 'collate_fn': collate_fn, + 'batch_size': self.train_batch_size, + 'shuffle': False, + 'drop_last': True, + 'sampler': train_sampler, + 'num_workers': 2, + } + + train_loader = DataLoader(self.train_dataset, **train_params) + + for idx, batch in enumerate(train_loader, start=1): + model_outputs = ddp_model(**batch) + loss, sample_size, logging_output = self.criterion( + model_outputs, batch) + loss.backward() + optimizer.zero_grad() + if lr_scheduler is not None: + lr_scheduler.step() + optimizer.step() + optimizer.zero_grad() + if idx % 10 == 0: + logger.info( + 'epoch: {}, train batch {}/{}, loss={:.5f}'.format( + epoch, idx, len(train_loader), loss.item())) + if dist.get_rank() == 0: + os.makedirs(self.ckpt_dir, exist_ok=True) + torch.save(ddp_model.module.state_dict(), + f'{self.ckpt_dir}/epoch{epoch}.bin') + + def evaluate(self, + checkpoint_path: Optional[str] = None, + *args, + **kwargs) -> Dict[str, float]: + pass diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 10acc870..38a13f4d 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -35,7 +35,7 @@ class OFADataset(Dataset): self.dataset = OFAFileDataset( file_path=file_path, - selected_col_ids=selected_col_ids, + selected_col_ids=','.join(selected_col_ids), dtypes=dtypes, separator=separator, cached_index=cached_index) @@ -157,7 +157,7 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): self.constraint_start = None self.constraint_end = None - if args.constraint_range is not None: + if args.constraint_range: constraint_start, constraint_end = args.constraint_range.split(',') self.constraint_start = int(constraint_start) self.constraint_end = int(constraint_end) @@ -280,35 +280,39 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): return loss, nll_loss, ntokens -def get_schedule(args): +def get_schedule(scheduler): - if args.schedule == 
'const': + if scheduler.name == 'const': scheduler_class = transformers.get_constant_schedule_with_warmup scheduler_args = { 'num_warmup_steps': - int(args.warmup_proportion * args.num_train_steps) + int(scheduler.warmup_proportion * scheduler.num_train_steps) } - elif args.schedule == 'linear': + elif scheduler.name == 'linear': scheduler_class = transformers.get_linear_schedule_with_warmup scheduler_args = { 'num_warmup_steps': - int(args.warmup_proportion * args.num_train_steps), - 'num_training_steps': args.num_train_steps + int(scheduler.warmup_proportion * scheduler.num_train_steps), + 'num_training_steps': + scheduler.num_train_steps } - elif args.schedule == 'cosine': + elif scheduler.name == 'cosine': scheduler_class = transformers.get_cosine_schedule_with_warmup scheduler_args = { 'num_warmup_steps': - int(args.warmup_proportion * args.num_train_steps), - 'num_training_steps': args.num_train_steps + int(scheduler.warmup_proportion * scheduler.num_train_steps), + 'num_training_steps': + scheduler.num_train_steps } - elif args.schedule == 'polynomial_decay': + elif scheduler.name == 'polynomial_decay': scheduler_class = transformers.get_polynomial_decay_schedule_with_warmup scheduler_args = { 'num_warmup_steps': - int(args.warmup_proportion * args.num_train_steps), - 'num_training_steps': args.num_train_steps, - 'lr_end': args.lr_end + int(scheduler.warmup_proportion * scheduler.num_train_steps), + 'num_training_steps': + scheduler.num_train_steps, + 'lr_end': + scheduler.lr_end } else: raise NotImplementedError diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index bfec1b85..af0cf2dc 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os import shutil import unittest @@ -13,7 +14,8 @@ class TestOfaTrainer(unittest.TestCase): model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_text-classification_mnli_large_en' self.trainer = OFATrainer(model_id) self.trainer.train() - shutil.rmtree(self.trainer.save_dir) + if os.path.exists(self.trainer.work_dir): + shutil.rmtree(self.trainer.work_dir) if __name__ == '__main__': From 32994aa5b4847c0d0c75bbfcbc2bcb2ed67721eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Mon, 5 Sep 2022 11:39:32 +0800 Subject: [PATCH 04/54] for kangdi debug --- data/test/text/mnli/train.tsv | 101 ++++++++++++++++++ data/test/text/mnli/valid.tsv | 11 ++ .../models/multi_modal/ofa_for_all_tasks.py | 2 - modelscope/preprocessors/multi_modal.py | 8 +- modelscope/preprocessors/ofa/utils/collate.py | 7 +- .../trainers/multi_modal/ofa/ofa_trainer.py | 36 +++++-- modelscope/trainers/trainer.py | 6 +- 7 files changed, 153 insertions(+), 18 deletions(-) create mode 100644 data/test/text/mnli/train.tsv create mode 100644 data/test/text/mnli/valid.tsv diff --git a/data/test/text/mnli/train.tsv b/data/test/text/mnli/train.tsv new file mode 100644 index 00000000..83746457 --- /dev/null +++ b/data/test/text/mnli/train.tsv @@ -0,0 +1,101 @@ +sentence1 sentence2 label sentence1_genre +Alarm bells would not start ringing until these efforts-which could take five minutes or more-were tried and had failed. Alarm bells would not start until efforts had failed. 1 nineeleven:Alarm bells would not start ringing until these efforts-which could take five minutes or more-were tried and had failed. 
+In those countries where dialect study is undertaken, dialectologists observe that there are today many factors militating against the strict maintenance of older dialect the standardization of terminology as adopted by national periodicals, news services, radio, and television; the establishment of prestige dialects and, through the media, their promulgation; and the huge population shifts that have taken place, particularly in the U.S. since WWII. Outside of the U.S., this phenomenon is most prominently seen in the other countries involved in WWII. 0 verbatim:In those countries where dialect study is undertaken, dialectologists observe that there are today many factors militating against the strict maintenance of older dialect the standardization of terminology as adopted by national periodicals, news services, radio, and television; the establishment of prestige dialects and, through the media, their promulgation; and the huge population shifts that have taken place, particularly in the U.S. since WWII. +In the hands of parents and teachers lies the awesome responsibility of conveying to the next generation the intellectual, scientific, aesthetic, and moral achievements that dierentiate our species from others. Parents have the responsibility to convey to the next generation the scientific achievements humans have made. 1 oup:In the hands of parents and teachers lies the awesome responsibility of conveying to the next generation the intellectual, scientific, aesthetic, and moral achievements that dierentiate our species from others. +By 9:20, Indianapolis Center learned that there were other hijacked aircraft, and began to doubt its initial assumption that American 77 had crashed. American 77 was confirmed to have crashed in an unrelated incident. 2 nineeleven:By 9:20, Indianapolis Center learned that there were other hijacked aircraft, and began to doubt its initial assumption that American 77 had crashed. +How about making their publicity buyer-friendlier as well? We need to have less of an input into publicity from buyers. 2 verbatim:How about making their publicity buyer-friendlier as well? +He liked to do little league and soccer with him, and we did all the things families do. He liked to engage in typical family activities with him, like soccer and little league. 1 letters:He liked to do little league and soccer with him, and we did all the things families do. +Business units adopting both bar codes and EDI are therefore able to reduce the transaction costs for processing information about sales and orders. Business units use either bar codes or EDI. 0 oup:Business units adopting both bar codes and EDI are therefore able to reduce the transaction costs for processing information about sales and orders. +When bar codes and EDI are combined with advanced shipping practices, the benefit of each practice is enhanced; order processing occurs more rapidly, accurately, and with less paper. Bar codes and EDI are synergistic with advanced shipping practice. 1 oup:When bar codes and EDI are combined with advanced shipping practices, the benefit of each practice is enhanced; order processing occurs more rapidly, accurately, and with less paper. +In all the following cases, the spelling, (apparent) roots, or sound of the word actively suggest a meaning different from the true one. The real meaning of the word is separate to its roots, spelling, and sound. 
1 verbatim:In all the following cases, the spelling, (apparent) roots, or sound of the word actively suggest a meaning different from the true one. +In 2001, with Bin Ladin's help they re-formed into an organization called Ansar al Islam. Bin Ladin helped reform a group called Ansar al Islam. 1 nineeleven:In 2001, with Bin Ladin's help they re-formed into an organization called Ansar al Islam. +We are pleased to tell you of a very exciting development with the fund, which has reached a market value of $750,000. The fund has a market value of $750,000 because we invested heavily in a ponzi scheme. 0 letters:We are pleased to tell you of a very exciting development with the fund, which has reached a market value of $750,000. +Men say that, too, of course. Women are the only ones who say that. 2 verbatim:Men say that, too, of course. +The jagged heavy line in Figure 6.5 (page 100) depicts a typical inventory pattern for a replenishable product like our blue jeans in size 8. Note that the inventory level drops gradually as consumers purchase the item. The clean straight line in Figure 6.5 illustrates the inventory pattern for replenishible products. 2 oup:The jagged heavy line in Figure 6.5 (page 100) depicts a typical inventory pattern for a replenishable product like our blue jeans in size 8. Note that the inventory level drops gradually as consumers purchase the item. +Our 90th Birthday celebration began in July and will continue through February. The celebration will include a promotion for sales lasting for the duration of the celebration. 0 letters:Our 90th Birthday celebration began in July and will continue through February. +And, you know, with this, you know, it wasn't many opportunities for kids to be special, because kids weren't, you know, you were pushed out of adult conversation, and just really pushed to the side. Kids were so very special, even being included in adult conversations and given multiple opportunities. 2 facetoface:And, you know, with this, you know, it wasn't many opportunities for kids to be special, because kids weren't, you know, you were pushed out of adult conversation, and just really pushed to the side. +As a participant in the Chancellor's Circle or Chancellor's Associates, you will receive reports from Jerry Bepko on how he puts your gifts to work. You will receive reports from Jerry as frequently as you request. 0 letters:As a participant in the Chancellor's Circle or Chancellor's Associates, you will receive reports from Jerry Bepko on how he puts your gifts to work. +Um, Christmas is coming up pretty soon huh? It's soon going to be our Christmas party. 0 facetoface:Um, Christmas is coming up pretty soon huh? +-The new Masters in Planning degree; The Masters of Planning degree has been around for a very long time. 2 letters:-The new Masters in Planning degree; +She responded by throwing down the block and turning to another activity. She responded by abandoning the block, and engaging in another activity. 1 oup:She responded by throwing down the block and turning to another activity. +Appreciate it. I'm forever grateful. 0 nineeleven:Appreciate it. +This book is a good introduction to the subject (in England); those familiar with dialectology in America, and those interested in the study in England or, indeed, generally would be well advised to add Word Maps to their libraries. The book describes differences between American English and British English. 
0 verbatim:This book is a good introduction to the subject (in England); those familiar with dialectology in America, and those interested in the study in England or, indeed, generally would be well advised to add Word Maps to their libraries. +One gets the impression that the editors of L used the good stuff from the W and substituted their own, much better material when they encountered some of the bad stuff. The movie mashup editors were surprised how well the lifted L material meshed with their contributions. 0 verbatim:One gets the impression that the editors of L used the good stuff from the W and substituted their own, much better material when they encountered some of the bad stuff. +I hope you will take this opportunity to make a contribution to support SEND's homeownership work. I hope you'll make a contribution to support the work SEND does. 1 letters:I hope you will take this opportunity to make a contribution to support SEND's homeownership work. +By the 1990s, high birthrates and declining rates of infant mortality had produced a common problem throughout the Muslim a large, steadily increasing population of young men without any reasonable expectation of suitable or steady employment-a sure prescription for social turbulence. The Muslims have a high number of births. 1 nineeleven:By the 1990s, high birthrates and declining rates of infant mortality had produced a common problem throughout the Muslim a large, steadily increasing population of young men without any reasonable expectation of suitable or steady employment-a sure prescription for social turbulence. +FAA headquarters had by this time established an open line of communication with the Command Center at Herndon and instructed it to poll all its centers about suspect aircraft. FAA headquarters refused to communicate with the Command Center at Herndon. 2 nineeleven:FAA headquarters had by this time established an open line of communication with the Command Center at Herndon and instructed it to poll all its centers about suspect aircraft. +Hani Hanjour, assigned to seat 1B (first class), soon followed. Hani Hanji was assigned to seat 1b most of the year. 0 nineeleven:Hani Hanjour, assigned to seat 1B (first class), soon followed. +But of what use is a long entry on spoonerisms? What what can this long entry do for us other than make us tired? 0 verbatim:But of what use is a long entry on spoonerisms? +What we are able to accomplish each year is a direct result of your generosity and your understanding of what it takes to provide the best legal education we possibly can. Your understanding has an effect on what we can accomplish. 1 letters:What we are able to accomplish each year is a direct result of your generosity and your understanding of what it takes to provide the best legal education we possibly can. +I want to know much of you. I don't have much time so we have to talk about you now or never. 0 verbatim:I want to know much of you. +I am pleased to tell you that we have had a positive response to the letter. We have had a positive response to the letter because we include drugs in the envelope. 0 letters:I am pleased to tell you that we have had a positive response to the letter. +At eight or ten stitches an inch, it is possible to seam thirteen to sixteen or more inches a second. Seaming between 13 and 15 inches per second is the ideal speed. 0 oup:At eight or ten stitches an inch, it is possible to seam thirteen to sixteen or more inches a second. 
+An English authority on dictionaries, James Root Hulbert, says that The Concise Oxford is the best for literary use in Britain and Chambers the best for general British use. The consise Oxford dictionary is the best one in all circumstances. 2 verbatim:An English authority on dictionaries, James Root Hulbert, says that The Concise Oxford is the best for literary use in Britain and Chambers the best for general British use. +At 8:51, the controller noticed the transponder change from United 175 and tried to contact the aircraft. The transponder code on United 175 changed and the controller tried contacting them. 1 nineeleven:At 8:51, the controller noticed the transponder change from United 175 and tried to contact the aircraft. +Captain Victor Saracini and First Officer Michael Horrocks piloted the Boeing 767, which had seven flight attendants. There were seven flight attendants aboard the Boeing 767. 1 nineeleven:Captain Victor Saracini and First Officer Michael Horrocks piloted the Boeing 767, which had seven flight attendants. +Fulfillment of this goal requires full participation from members of the Indiana Dental Association. In order to reach our goal we need full participation from members of the dental association. 1 letters:Fulfillment of this goal requires full participation from members of the Indiana Dental Association. +We put the baby mallard in a small aviary with the half-grown muscovy, and it worked. The mallard and the muscovy shared the aviary. 1 letters:We put the baby mallard in a small aviary with the half-grown muscovy, and it worked. +The President said he remembered such a conversation, and that it reminded him of when he had been an interceptor pilot. The President said nothing about the conversation in question. 2 nineeleven:The President said he remembered such a conversation, and that it reminded him of when he had been an interceptor pilot. +The information-integrated channels developed in the United States, which are now influencing sourcing patterns from Mexico and the Caribbean Basin, have begun to affect the textile and apparel sectors worldwide. Information-integrated channels have also been adopted in Europe more recently. 0 oup:The information-integrated channels developed in the United States, which are now influencing sourcing patterns from Mexico and the Caribbean Basin, have begun to affect the textile and apparel sectors worldwide. +The average tuition for a one-day C.E. course is about $125. The average tuition for a one-day C.E. course is over $100, but for an extra $50 you get the textbook included. 0 letters:The average tuition for a one-day C.E. course is about $125. +However, these are difficult times for public institutions of higher education, because legislative appropriations are either flat or in the decline. At the moment higher education institutions are thriving 2 letters:However, these are difficult times for public institutions of higher education, because legislative appropriations are either flat or in the decline. +For example, James Garner's Rockford dubbed as a Japanese tenor is a reminder of one's firm awareness of Garner's American tone and timbre. James Garner's Rockford dubbed as a Spanish tenor is quite impressive. 2 verbatim:For example, James Garner's Rockford dubbed as a Japanese tenor is a reminder of one's firm awareness of Garner's American tone and timbre. 
+He worked, he's a teacher, and at that time he worked as the principal of that school, of that school, because it was a, like a high school, there was, from first (grade) to high school. The man is a stripper, and a damn good one at that. 2 facetoface:He worked, he's a teacher, and at that time he worked as the principal of that school, of that school, because it was a, like a high school, there was, from first (grade) to high school. +Uh, my mom took me for a it, um, doctor's visit uh, it was a physical. My mom took me to the doctors for a physical. 1 facetoface:Uh, my mom took me for a it, um, doctor's visit uh, it was a physical. +The forecasting and inventory models presented in this chapter are not new; they have been recommended for years by statisticians and operations researchers. The inventory operations presented in this chapter all take a lot of time to implement. 0 oup:The forecasting and inventory models presented in this chapter are not new; they have been recommended for years by statisticians and operations researchers. +Gifts of $40.00 add up to provide valuable funding. Valuable funding can be made up of gifts of $40.00. 1 letters:Gifts of $40.00 add up to provide valuable funding. +The mission of the Social Health Association of Central Indiana is to promote healthy behavior and responsible relationships through sexuality education and life skills training. Social Health Association of Central Indiana wants to promote healthy behaviors through sex ed and life skills training. 1 letters:The mission of the Social Health Association of Central Indiana is to promote healthy behavior and responsible relationships through sexuality education and life skills training. +To begin with, the adoption of bar codes came before rapid replenishment arrangements because retailers required a low-cost means of collecting information at the detailed product level for their own use'that is, they first developed an efficient method for scanning prices at the check-out register and tracking products for internal inventory purposes. There are several cheap methods for retailer information collection, but bar codes are the best. 0 oup:To begin with, the adoption of bar codes came before rapid replenishment arrangements because retailers required a low-cost means of collecting information at the detailed product level for their own use'that is, they first developed an efficient method for scanning prices at the check-out register and tracking products for internal inventory purposes. +From that point of view the differing interpretations Mr. Anson and I read into the passage are of secondary importance. Mr. Anson was an expert at political interpretations. 0 verbatim:From that point of view the differing interpretations Mr. Anson and I read into the passage are of secondary importance. +But we know that at 10:31, General Larry Arnold instructed his staff to broadcast the following over a NORAD instant messaging 10:31 Vice president has cleared to us to intercept tracks of interest and shoot them down if they do not respond per [General Arnold]. General Larry Arnold told his staff to broadcast over a NORAD messaging service at 10:31 that the Vice president had authorized the shooting down of hijacked planes, they did so immediately. 
0 nineeleven:But we know that at 10:31, General Larry Arnold instructed his staff to broadcast the following over a NORAD instant messaging 10:31 Vice president has cleared to us to intercept tracks of interest and shoot them down if they do not respond per [General Arnold]. +We are leaders in the bar, business, government, and community affairs. We are the dregs of the community affairs and we know it. 2 letters:We are leaders in the bar, business, government, and community affairs. +He died in a ferryboat accident on Lake Victoria just a few days after Bin Ladin arrived in Jalalabad, leaving Bin Ladin with a need to replace him not only in the Shura but also as supervisor of the cells and prospective operations in East Africa. After his untimely death, Bin Ladin was forced to replace his roles in the Shura and in supervising cells in East Africa. 1 nineeleven:He died in a ferryboat accident on Lake Victoria just a few days after Bin Ladin arrived in Jalalabad, leaving Bin Ladin with a need to replace him not only in the Shura but also as supervisor of the cells and prospective operations in East Africa. +Letters in support or condemnation of the QES program (though one may assume they will insist on programme ) should be addressed to Mrs Anne Shelley, Secretary, Queen's English Society, 3 Manor Crescent, Guildford GU2 6NF, England. Mrs. Anne Shelley is in charge of the QES program. 2 verbatim:Letters in support or condemnation of the QES program (though one may assume they will insist on programme ) should be addressed to Mrs Anne Shelley, Secretary, Queen's English Society, 3 Manor Crescent, Guildford GU2 6NF, England. +This was done because of the organization of work in clothing shops; the low capital costs and high proportion of labor costs, especially in women's wear for contract shops; the intense product competition among manufacturers within and among geographic markets; and the diversity of products and changing styles. This was done because of how workers in clothing shops were organized according to experience. 0 oup:This was done because of the organization of work in clothing shops; the low capital costs and high proportion of labor costs, especially in women's wear for contract shops; the intense product competition among manufacturers within and among geographic markets; and the diversity of products and changing styles. +Cancel, and tear to pieces, that great bond Which keeps me pale! Remove and destroy the thing that keeps me pale! 1 verbatim:Cancel, and tear to pieces, that great bond Which keeps me pale! +Between 8:25 and 8:32, in accordance with the FAA protocol, Boston Center managers started notifying their chain of command that American 11 had been hijacked. it was not until 9:00 that Boston Center messengers realized that American 11 had been hijacked. 2 nineeleven:Between 8:25 and 8:32, in accordance with the FAA protocol, Boston Center managers started notifying their chain of command that American 11 had been hijacked. +I love it! I hate it. 2 facetoface:I love it! +Instead, in a number of cases their rulers sought to buy off local Islamist movements by ceding control of many social and educational issues. This is why so much violence has been directed away from their native countries. 0 nineeleven:Instead, in a number of cases their rulers sought to buy off local Islamist movements by ceding control of many social and educational issues. 
+The time saved in production can be lost if the distribution method is slow, or if there are other impediments to the movement of products from the apparel-maker to the retailer. The shortened production time would be wasted if the distribution is slow. 1 oup:The time saved in production can be lost if the distribution method is slow, or if there are other impediments to the movement of products from the apparel-maker to the retailer. +Periodically, the Islamic world has seen surges of what, for want of a better term, is often labeled fundamentalism. Fundamentalism periodically surfaces in Islamic countries. 1 nineeleven:Periodically, the Islamic world has seen surges of what, for want of a better term, is often labeled fundamentalism. +He told us that by the time he arrived, the order had already been passed down NORAD's chain of command. He told us that the order had been sent down from the FAA. 2 nineeleven:He told us that by the time he arrived, the order had already been passed down NORAD's chain of command. +But uh, we hear a lot about home and how it used to be and like I said walking five miles to school and-- We like to know what the old days are like. 0 facetoface:But uh, we hear a lot about home and how it used to be and like I said walking five miles to school and-- +noisome Has nothing to do with sound or decibel level, but means simply unpleasant or disgusting. Noisome had something to do with the lights, however. 0 verbatim:noisome Has nothing to do with sound or decibel level, but means simply unpleasant or disgusting. +However, it didn't seem to be so horrifying. It did not seem to be so scary. 1 facetoface:However, it didn't seem to be so horrifying. + Hold on a second. Wait a second. 1 nineeleven: Hold on a second. + What did it look like? What did it look like to you? 1 oup: What did it look like? +Mass customization of this sort also means that a single garment must pass through the sewing room at a time. The complex customization of the garment requires every worker's attention. 0 oup:Mass customization of this sort also means that a single garment must pass through the sewing room at a time. +Cobuild and CED are far from being such polar opposites, but they exemplify this general point. Cobuild and CED are polar opposites, no matter which way you look at it. 2 verbatim:Cobuild and CED are far from being such polar opposites, but they exemplify this general point. +The flight did not respond. The flight responded. 2 nineeleven:The flight did not respond. +Here we will show how a decision tool can be used to make the transition from general intuition to specific decisions about (1) which products to make in each plant and (2) how to schedule the time and quantity of production for each product. Here we are going to show how a decision tool can be useful in making the transition from intuition to specific decision. 1 oup:Here we will show how a decision tool can be used to make the transition from general intuition to specific decisions about (1) which products to make in each plant and (2) how to schedule the time and quantity of production for each product. +The air defense of America began with this call. America's air defense had already begun before the call was made. 2 nineeleven:The air defense of America began with this call. +[I]mmigrants are cheap and controllable. German immigrants are easy to afford and control. 0 oup:[I]mmigrants are cheap and controllable. 
+(It should be noted that Johnson made the same kind of adaptation of another poem by Juvenal, Satire X, calling it The Vanity of Human Wishes. Johnson made no adaptations to poems by Juvenal. 2 verbatim:(It should be noted that Johnson made the same kind of adaptation of another poem by Juvenal, Satire X, calling it The Vanity of Human Wishes. +Callers reported that a passenger had been stabbed and that two people were lying on the floor of the cabin, injured or dead-possibly the captain and first officer. No one called from the airplane at all. 2 nineeleven:Callers reported that a passenger had been stabbed and that two people were lying on the floor of the cabin, injured or dead-possibly the captain and first officer. +Many Americans have wondered, Why do 'they' hate us? Americans wonder why they hate them. 0 nineeleven:Many Americans have wondered, Why do 'they' hate us? +It is my (wholly unsubstantiated) guess that the majority of Soviet personnel in Vietnam are in fact not Russian. It is likely that most of them are from other places 0 verbatim:It is my (wholly unsubstantiated) guess that the majority of Soviet personnel in Vietnam are in fact not Russian. +We thought it would be cool to see just how far a BB would shoot. We didn't have a BB gun. 2 facetoface:We thought it would be cool to see just how far a BB would shoot. +For Indianapolis, that public university must be IUPUI. The IUPUI university is the only public university in town. 0 letters:For Indianapolis, that public university must be IUPUI. +Hopefully, all of us can do more internal marketing with our young patients to encourage them to consider the field of Dental Assisting. No one should be encouraged to consider being a dental assistant. 2 letters:Hopefully, all of us can do more internal marketing with our young patients to encourage them to consider the field of Dental Assisting. +NEADS decided to keep the Otis fighters over New York. The fighters were on guard to destroy any airplane. 0 nineeleven:NEADS decided to keep the Otis fighters over New York. +He trained in desktop publishing and combined his enthusiastic work ethic with new-found skills in a burgeoning industry. This person learned about publishing. 1 letters:He trained in desktop publishing and combined his enthusiastic work ethic with new-found skills in a burgeoning industry. +Who would have been telling you those stories? Who did you tell those stories to? 2 facetoface:Who would have been telling you those stories? +So, I have my sister's kid here and I'm going to kill him underneath this vehicle shortly. My sister does not have a child. 2 facetoface:So, I have my sister's kid here and I'm going to kill him underneath this vehicle shortly. +According to one report, Saddam Hussein's efforts at this time to rebuild relations with the Saudis and other Middle Eastern regimes led him to stay clear of Bin Ladin. It was because of his time spent making up for past actions that Saddam Hussein did not come in contact with Bin Laden. 0 nineeleven:According to one report, Saddam Hussein's efforts at this time to rebuild relations with the Saudis and other Middle Eastern regimes led him to stay clear of Bin Ladin. +He motioned to me as if they were going to cut off his head. He made a rude gesture at me with one of his fingers. 2 facetoface:He motioned to me as if they were going to cut off his head. 
+It is not at once apparent why the book is styled an almanac, but that is there is no other book I know of that contains as much diverse information about American writers as this one. The book is extremely thorough and well written. 1 verbatim:It is not at once apparent why the book is styled an almanac, but that is there is no other book I know of that contains as much diverse information about American writers as this one. +Here the answer is a definite yes. The answer is yes. 1 oup:Here the answer is a definite yes. +Today, Bodenheim's novel might be of interest to students of the English language because of its use of slang. Bodenheim's novel might be of interest to students of French Cuisine because of its use of recipes. 2 verbatim:Today, Bodenheim's novel might be of interest to students of the English language because of its use of slang. +Each week's demand has been divided by the average demand over the twenty-four weeks; therefore, the average weekly demand is simply equal to 1.0 on this normalized scale. Weekly demand is divided by the twenty four week average. 1 oup:Each week's demand has been divided by the average demand over the twenty-four weeks; therefore, the average weekly demand is simply equal to 1.0 on this normalized scale. +Even though I had freedom when I was, you know, home, whatever, but I still had a curfew. I had freedom when I was home, and there was no curfew, yahoo! 2 facetoface:Even though I had freedom when I was, you know, home, whatever, but I still had a curfew. +Thus, Step down (or back) and give me a shot was readily understood. Therefore, statements referring to giving me a shot were comprehended. 1 verbatim:Thus, Step down (or back) and give me a shot was readily understood. +For Indianapolis, that public university must be IUPUI. IUPUI is in the city of Chicago. 2 letters:For Indianapolis, that public university must be IUPUI. +I'm sending this follow-up letter to let you know that your support is greatly needed and appreciated by everyone involved with graduate Endodontics at IU. I have sent you 50 letters before this follow-up letter because you refuse to answer any other letters. 0 letters:I'm sending this follow-up letter to let you know that your support is greatly needed and appreciated by everyone involved with graduate Endodontics at IU. +The Herron School of Art and Gallery of Indiana University is contemporary art! The gallery at the Herron school displays contemporary art. 1 letters:The Herron School of Art and Gallery of Indiana University is contemporary art! +So, I'm kind of like the hope, I guess. I suppose I'm the hope, or something. 1 facetoface:So, I'm kind of like the hope, I guess. +Please donate today. Our website is down please come back tomorrow to make a donation. 2 letters:Please donate today. +Do you watch that? Can you see? 2 facetoface:Do you watch that? +To a Western ear, the most predictable of language traits, perhaps, is the well-advertised Japanese use of r for our l . Indeed, in my travels about Honshu during a three-month visit, I did hear coinrocker, see you rater, Adurt Graphics (dirty books), blackwrrants (hit-and-miss rendering of black walnuts), Coffee Corombia (a chain of coffee shops), and Coconut Glove. To the Western ear, the least predictable of language traits are perhaps the most well-advertised use of r. 2 verbatim:To a Western ear, the most predictable of language traits, perhaps, is the well-advertised Japanese use of r for our l . 
Indeed, in my travels about Honshu during a three-month visit, I did hear coinrocker, see you rater, Adurt Graphics (dirty books), blackwrrants (hit-and-miss rendering of black walnuts), Coffee Corombia (a chain of coffee shops), and Coconut Glove. +The recorder captured the sounds of loud thumps, crashes, shouts, and breaking glasses and plates. The recorder didn't capture any of the sounds. 2 nineeleven:The recorder captured the sounds of loud thumps, crashes, shouts, and breaking glasses and plates. +That's a good attitude! You feel good about this, don't you? 0 facetoface:That's a good attitude! +Bloomer (for `flower'), butter (for `ram'), or even flower (for `river') are recurrent examples, but solvers must always be on the alert for new traps of this Bloomer is another word for flower, butter is for ram and flower for river. 1 verbatim:Bloomer (for `flower'), butter (for `ram'), or even flower (for `river') are recurrent examples, but solvers must always be on the alert for new traps of this diff --git a/data/test/text/mnli/valid.tsv b/data/test/text/mnli/valid.tsv new file mode 100644 index 00000000..dd720865 --- /dev/null +++ b/data/test/text/mnli/valid.tsv @@ -0,0 +1,11 @@ +sentence1 sentence2 label sentence1_genre +The new rights are nice enough Everyone really likes the newest benefits 0 slate:The new rights are nice enough +This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. 2 government:This site includes a list of all award winners and a searchable database of Government Executive articles. +uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. 1 telephone:uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him +yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. 2 telephone:yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food +i don't know um do you do a lot of camping I know exactly. 2 telephone:i don't know um do you do a lot of camping +well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be We have plenty of space in the landfill. 2 telephone:well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be +yeah i know and i did that all through college and it worked too I did that all through college but it never worked 2 telephone:yeah i know and i did that all through college and it worked too +Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. Most of Mrinal Sen's work can be found in European collections. 
0 travel:Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. +If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. THe strike price could be $8. 2 slate:If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. +3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? Would you rise up and defeaat all evil lords in the town? 0 slate:3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 4528a9da..9583f86f 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -127,8 +127,6 @@ class OfaForAllTasks(TorchModel): return input def _text_gen_inference(self, input): - import pdb - pdb.set_trace() input = move_to_device(input, self._device) if 'prefix_tokens' in input: gen_output = self.generator.generate( diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 17d61ae3..930f374b 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -68,6 +68,10 @@ class OfaPreprocessor(Preprocessor): cfg=self.cfg, model_dir=model_dir, mode=mode) self.keys = input_key_mapping[self.cfg.task] self.tokenizer = self.preprocess.tokenizer + if kwargs.get('no_collate', None): + self.no_collate = True + else: + self.no_collate = False # just for modelscope demo def _build_dict(self, input: Union[Input, List[Input]]) -> Dict[str, Any]: @@ -98,7 +102,9 @@ class OfaPreprocessor(Preprocessor): for k, v in data.items(): str_data[k] = str(v) sample['sample'] = str_data - if kwargs.get('no_collate', None): + # import pdb + # pdb.set_trace() + if self.no_collate: return sample else: return collate_fn([sample], diff --git a/modelscope/preprocessors/ofa/utils/collate.py b/modelscope/preprocessors/ofa/utils/collate.py index 82258e8b..7c17c23b 100644 --- a/modelscope/preprocessors/ofa/utils/collate.py +++ b/modelscope/preprocessors/ofa/utils/collate.py @@ -50,11 +50,10 @@ def collate_fn(samples, pad_idx, eos_idx): if samples[0].get('constraint_mask', None) is not None: batch['constraint_masks'] = merge('constraint_mask') if samples[0].get('decoder_prompt', None) is not None: - batch['decoder_prompts'] = torch.stack( - [s['decoder_prompt'] for s in samples], dim=0) + batch['decoder_prompts'] = np.array( + [s['decoder_prompt'].tolist() for s in samples]) if samples[0].get('prefix_token', None) is not None: - batch['prefix_tokens'] = torch.stack( - [s['prefix_token'] for s in samples], dim=0) + batch['prefix_tokens'] = merge('prefix_token') # For detection and visual grounding if samples[0].get('w_resize_ratio', None) is not None: batch['w_resize_ratios'] = torch.stack( diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index fae79a74..0c33118e 100644 --- 
a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -1,4 +1,5 @@ import os +from functools import partial from typing import Dict, Optional from datasets import load_dataset @@ -12,7 +13,7 @@ from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config -from modelscope.utils.constant import ModeKeys, ModelFile +from modelscope.utils.constant import ConfigKeys, ModeKeys, ModelFile from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, OFADataset, get_schedule) @@ -21,8 +22,6 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, class OFATrainer(EpochBasedTrainer): def __init__(self, model: str, *args, **kwargs): - # import pdb - # pdb.set_trace() model = Model.from_pretrained(model) model_dir = model.model_dir cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) @@ -32,7 +31,22 @@ class OFATrainer(EpochBasedTrainer): data_files=cfg.dataset.hf_dataset, sep=cfg.dataset.sep, ) - ms_dadaset = MsDataset.from_hf_dataset(dataset) + dataset = MsDataset.from_hf_dataset( + dataset.rename_columns(cfg.dataset.column_map)) + preprocessor = { + ConfigKeys.train: + OfaPreprocessor( + model_dir=model_dir, model=ModeKeys.TRAIN, no_collate=True), + ConfigKeys.val: + OfaPreprocessor( + model_dir=model_dir, model=ModeKeys.EVAL, no_collate=True), + } + # train_dataset = dataset['train'].to_torch_dataset( + # preprocessors=OfaPreprocessor(model_dir=model_dir, model=ModeKeys.TRAIN, no_collate=True), + # ) + # valid_dataset = dataset['valid'].to_torch_dataset( + # preprocessors=OfaPreprocessor(model_dir=model_dir, model=ModeKeys.TRAIN, no_collate=True), + # ) # train_dataset = OFADataset( # file_path=cfg.dataset.train_set, # selected_id_keys=cfg.dataset.selected_id_keys, @@ -45,7 +59,7 @@ class OFATrainer(EpochBasedTrainer): # preprocessor=OfaPreprocessor( # model_dir=model_dir, mode=ModeKeys.EVAL), # ) - epoch_steps = len(ms_dadaset['train']) // ( + epoch_steps = len(dataset['train']) // ( cfg.train.gradient_accumulation_steps * cfg.train.dataloader.batch_size_per_gpu) cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs @@ -59,20 +73,26 @@ class OFATrainer(EpochBasedTrainer): **scheduler_args) else: lr_scheduler = None + collator = partial( + collate_fn, + pad_idx=model.tokenizer.pad_token_id, + eos_idx=model.tokenizer.eos_token_id, + ) super().__init__( cfg_file=cfg_file, model=model, - data_collator=collate_fn, + data_collator=collator, train_dataset=dataset['train'], eval_dataset=dataset['valid'], + preprocessor=preprocessor, optimizers=(optimizer, lr_scheduler), work_dir=cfg.train.work_dir, *args, **kwargs, ) - def train(self, *args, **kwargs): - pass + # def train(self, *args, **kwargs): + # pass def evaluate(self, checkpoint_path: Optional[str] = None, diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 614b728a..62378997 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -155,11 +155,12 @@ class EpochBasedTrainer(BaseTrainer): device_name = kwargs.get('device', 'gpu') verify_device(device_name) self.device = create_device(device_name) - self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.train_preprocessor) + # import pdb + # pdb.set_trace() self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, @@ 
-426,7 +427,6 @@ class EpochBasedTrainer(BaseTrainer): self.register_optimizers_hook() self.register_hook_from_cfg(self.cfg.train.hooks) - self.train_loop(self.train_dataloader) def evaluate(self, checkpoint_path=None): @@ -626,7 +626,7 @@ class EpochBasedTrainer(BaseTrainer): torch_dataset = dataset.to_torch_dataset( task_data_config=cfg, task_name=self.cfg.task, - preprocessors=self.preprocessor) + preprocessors=preprocessor) else: torch_dataset = build_task_dataset(data_cfg, self.cfg.task) dataset = self.to_task_dataset(torch_dataset, mode) From 5d83f62312bd101df965d07ead96070105c75e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Wed, 21 Sep 2022 15:52:10 +0800 Subject: [PATCH 05/54] mnli finetune done --- modelscope/metrics/accuracy_metric.py | 44 +++++++ .../ofa/generate/sequence_generator.py | 6 +- .../models/multi_modal/ofa/utils/__init__.py | 1 + .../models/multi_modal/ofa_for_all_tasks.py | 77 ++++++----- modelscope/msdatasets/ms_dataset.py | 7 +- .../cv/image_classification_pipeline.py | 2 + modelscope/preprocessors/multi_modal.py | 9 +- modelscope/preprocessors/ofa/base.py | 33 +++-- .../preprocessors/ofa/image_captioning.py | 17 ++- .../preprocessors/ofa/image_classification.py | 2 +- modelscope/preprocessors/ofa/summarization.py | 2 +- .../preprocessors/ofa/text_classification.py | 50 ++++++-- .../ofa/text_to_image_synthesis.py | 2 +- modelscope/preprocessors/ofa/utils/collate.py | 2 + .../preprocessors/ofa/visual_entailment.py | 2 +- .../preprocessors/ofa/visual_grounding.py | 2 +- .../ofa/visual_question_answering.py | 2 +- .../trainers/multi_modal/ofa/ofa_trainer.py | 68 +++++----- .../multi_modal/ofa/ofa_trainer_old.py | 120 ------------------ .../multi_modal/ofa/ofa_trainer_utils.py | 38 +----- modelscope/trainers/trainer.py | 18 +-- modelscope/trainers/utils/inference.py | 44 ++++--- modelscope/utils/device.py | 16 ++- modelscope/utils/multi_modal/forked_pdb.py | 17 +++ tests/pipelines/test_ofa_tasks.py | 21 +++ tests/trainers/test_ofa_trainer.py | 1 + 26 files changed, 322 insertions(+), 281 deletions(-) create mode 100644 modelscope/metrics/accuracy_metric.py delete mode 100644 modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py create mode 100644 modelscope/utils/multi_modal/forked_pdb.py diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py new file mode 100644 index 00000000..0f73ce64 --- /dev/null +++ b/modelscope/metrics/accuracy_metric.py @@ -0,0 +1,44 @@ +from typing import Dict + +import numpy as np + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module(group_key=default_group, module_name=Metrics.accuracy) +class AccuracyMetric(Metric): + """The metric computation class for sequence classification classes. + + This metric class calculates accuracy for the whole input batches. 
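+
+    A minimal usage sketch, staying within this module's own names: predictions
+    and references are plain Python lists (or numpy arrays) keyed by
+    ``OutputKeys.LABELS`` (or ``OutputKeys.LABEL``) in both the ``outputs`` and
+    ``inputs`` dicts passed to ``add``::
+
+        metric = AccuracyMetric()
+        metric.add(outputs={OutputKeys.LABELS: [1, 0, 2]},
+                   inputs={OutputKeys.LABELS: [1, 1, 2]})
+        metric.evaluate()  # -> {MetricKeys.ACCURACY: 2 / 3}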
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.preds = [] + self.labels = [] + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[label_name] + assert type(ground_truths) == type(eval_results) + if isinstance(ground_truths, list): + self.preds.extend(eval_results) + self.labels.extend(ground_truths) + elif isinstance(ground_truths, np.ndarray): + self.preds.extend(eval_results.tolist()) + self.labels.extend(ground_truths.tolist()) + else: + raise 'only support list or np.ndarray' + + def evaluate(self): + assert len(self.preds) == len(self.labels) + return { + MetricKeys.ACCURACY: (np.asarray([ + pred == ref for pred, ref in zip(self.preds, self.labels) + ])).mean().item() + } diff --git a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py index 15d19e2c..e42d3c8e 100644 --- a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py +++ b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py @@ -409,10 +409,12 @@ class SequenceGenerator(nn.Module): out_prefix = p_toks_len_beam < ( step + no_repeat_ngram_size - 1) else: - out_prefix = [True] * bsz * beam_size + out_prefix = torch.ones(bsz * beam_size).bool() ngram_blocker_tokens = tokens[out_prefix] ngram_blocker_lprobs = lprobs[out_prefix] - ngram_blocker_bsz = out_prefix.sum() // beam_size + ngram_blocker_bsz = torch.div( + out_prefix.sum(), beam_size, rounding_mode='trunc') + lprobs[out_prefix] = self.repeat_ngram_blocker( tokens=ngram_blocker_tokens, lprobs=ngram_blocker_lprobs, diff --git a/modelscope/models/multi_modal/ofa/utils/__init__.py b/modelscope/models/multi_modal/ofa/utils/__init__.py index e69de29b..f515818c 100644 --- a/modelscope/models/multi_modal/ofa/utils/__init__.py +++ b/modelscope/models/multi_modal/ofa/utils/__init__.py @@ -0,0 +1 @@ +from .constant import OFA_TASK_KEY_MAPPING diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 9583f86f..cb8d3826 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -10,7 +10,6 @@ import torch.nn.functional as F from modelscope.metainfo import Models from modelscope.models import TorchModel -from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.outputs import OutputKeys from modelscope.preprocessors.ofa.utils.collate import collate_tokens @@ -38,7 +37,9 @@ class OfaForAllTasks(TorchModel): def __init__(self, model_dir, *args, **kwargs): super().__init__(model_dir=model_dir, *args, **kwargs) - model = OFAModel.from_pretrained(model_dir) + sd = torch.load(osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) + sd = sd if 'meta' not in sd else sd['state_dict'] + model = OFAModel.from_pretrained(model_dir, state_dict=sd) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) self.model = model.module if hasattr(model, 'module') else model @@ -65,10 +66,9 @@ class OfaForAllTasks(TorchModel): self.gen_type = self.cfg.model.get('gen_type', 'generation') assert self.gen_type in ['generation', 'traverse'], \ 'model.gen_type must be in ["generation", "traverse"]' - self._device = torch.device('cuda') if torch.cuda.is_available() \ - else torch.device('cpu') - self.eos_item = 
torch.LongTensor([self.tokenizer.eos_token_id - ]).to(self._device) + self.bos_item = torch.LongTensor([self.tokenizer.bos_token_id]) + self.pad_item = torch.LongTensor([self.tokenizer.pad_token_id]) + self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id]) self.index2ans = {} self.ans2label_dict = {} self.load_ans2label() @@ -89,7 +89,8 @@ class OfaForAllTasks(TorchModel): self.val_masks_l = [] self.build_trie() sg_args['constraint_trie'] = self.constraint_trie - self.model.to(self._device) + else: + self.constraint_trie = None self.generator = sg.SequenceGenerator(**sg_args) inference_d = { 'generation': self._text_gen_inference, @@ -106,42 +107,52 @@ class OfaForAllTasks(TorchModel): } def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + input = move_to_device(input, self.model.device) + if self.model.training: + return self.model(**input['net_input']) + else: + return self.inference(input) + + def inference(self, input: Dict[str, Any]) -> Dict[str, Any]: ret = self.task_inference_mapping[self.cfg.task](input) - ret['samples'] = input['samples'] + if 'samples' in input: + ret['samples'] = input['samples'] for key in [ OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, OutputKeys.LABELS, OutputKeys.SCORES ]: - if key in ret and len(ret[key]) == 1: - ret[key] = ret[key][0] if key not in ret: ret[key] = None return ret - def postprocess(self, input: Dict[str, Tensor], - **kwargs) -> Dict[str, Tensor]: + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: if self.cfg.task == Tasks.image_captioning: caption = input[OutputKeys.CAPTION] - caption = caption.translate(self.transtab).strip() + result_l = list() + for cap in caption: + result_l.append(cap.translate(self.transtab).strip()) input[OutputKeys.CAPTION] = caption + return input def _text_gen_inference(self, input): - input = move_to_device(input, self._device) - if 'prefix_tokens' in input: - gen_output = self.generator.generate( - [self.model], input, prefix_tokens=input['prefix_tokens']) - else: - gen_output = self.generator.generate([self.model], input) + gen_outputs = self.generator.generate([self.model], + input, + prefix_tokens=input.get( + 'prefix_tokens', None)) gen_l = list() - for i in range(len(gen_output)): - if 'prefix_tokens' in input: - prefix_tokens = input['prefix_tokens'] - gen_l.append( - gen_output[i][0]['tokens'][len(prefix_tokens[i]):]) + for idx, gen_out in enumerate(gen_outputs): + if len(gen_out) > 0: + decode_tokens = gen_out[0]['tokens'] + if 'prefix_tokens' in input: + prefix_len = input['prefix_tokens'][idx].ne( + self.pad_item.to(self.model.device)).sum() + decode_tokens = decode_tokens[prefix_len:] + gen_l.append(decode_tokens) else: - gen_l.append(gen_output[i][0]['tokens']) + gen_l.append('') result = self.tokenizer.batch_decode(gen_l, skip_special_tokens=True) + result = [item.strip() for item in result] # text generation tasks have no score ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result} if self.cfg.task.endswith('classification'): @@ -149,7 +160,6 @@ class OfaForAllTasks(TorchModel): return ret def _visual_grounding_inference(self, input): - input = move_to_device(input, self._device) gen_output = self.generator.generate([self.model], input) tokens = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] region_coord_l = list() @@ -163,13 +173,12 @@ class OfaForAllTasks(TorchModel): region_tensor[:, ::2] /= input['w_resize_ratios'] region_tensor[:, 1::2] /= input['h_resize_ratios'] return { - OutputKeys.BOXES: move_to_device(region_tensor, - 
torch.device('cpu')), + OutputKeys.BOXES: + move_to_device(region_tensor, torch.device('cpu')).tolist(), OutputKeys.SCORES: [1.0] * region_tensor.shape[0] } def _traverse_inference(self, input): - input = move_to_device(input, self._device) encoder_input = dict() for key in input['net_input'].keys(): encoder_input[key] = input['net_input'][key] @@ -193,19 +202,19 @@ class OfaForAllTasks(TorchModel): torch.cat([ torch.zeros( len(decoder_prompt) - 1, - valid_constraint_mask.size(1)).bool().to(self._device), + valid_constraint_mask.size(1)).bool(), valid_constraint_mask], dim=0) # yapf: disable for decoder_prompt in input['decoder_prompts'] # yapf: disable for valid_constraint_mask in val_masks] # yapf: disable valid_tgt = collate_tokens( valid_tgt_items, - pad_idx=self.tokenizer.pad_token_id).to(self._device) + pad_idx=self.tokenizer.pad_token_id).to(self.model.device) valid_prev_output = collate_tokens( valid_prev_items, - pad_idx=self.tokenizer.pad_token_id).to(self._device) + pad_idx=self.tokenizer.pad_token_id).to(self.model.device) val_masks = collate_tokens( valid_constraint_mask_items, - pad_idx=self.tokenizer.pad_token_id).to(self._device) + pad_idx=self.tokenizer.pad_token_id).to(self.model.device) new_encoder_out = { 'last_hidden_state': encoder_out['last_hidden_state'].repeat_interleave( @@ -280,8 +289,6 @@ class OfaForAllTasks(TorchModel): self.val_masks_l += [ constraint_mask_list[i:i + self.val_batch_size] ] - self.val_ans_l = move_to_device(self.val_ans_l, self._device) - self.val_masks_l = move_to_device(self.val_masks_l, self._device) def load_ans2label(self): if self.cfg.model.get('answer2label', None): diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 691db4fe..d0d0ab92 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -75,7 +75,7 @@ class MsIterableDataset(torch.utils.data.IterableDataset): } for preprocessor in self.preprocessor_list: res.update({ - k: torch.tensor(v) + k: v # k: torch.tensor(v) for k, v in preprocessor(item_dict).items() if k in self.retained_columns }) @@ -350,14 +350,15 @@ class MsDataset: def is_numpy_number(value): return np.issubdtype(value.dtype, np.integer) or np.issubdtype( - value.dtype, np.floating) + value.dtype, np.floating) or np.issubdtype( + value.dtype, np.bool) retained_columns = [] for k in sample_res.keys(): if not is_numpy_number(sample_res[k]): logger.warning( f'Data of column {k} is non-numeric, will be removed') - continue + # continue retained_columns.append(k) return MsIterableDataset(self._hf_ds, preprocessor_list, diff --git a/modelscope/pipelines/cv/image_classification_pipeline.py b/modelscope/pipelines/cv/image_classification_pipeline.py index 49467eab..69dbd1fb 100644 --- a/modelscope/pipelines/cv/image_classification_pipeline.py +++ b/modelscope/pipelines/cv/image_classification_pipeline.py @@ -13,6 +13,7 @@ from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import OfaPreprocessor, Preprocessor, load_image from modelscope.utils.constant import Tasks +from modelscope.utils.device import get_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -36,6 +37,7 @@ class ImageClassificationPipeline(Pipeline): else: raise NotImplementedError pipe_model.model.eval() + pipe_model.to(get_device()) if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) 
super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 930f374b..2416ea86 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -84,7 +84,12 @@ class OfaPreprocessor(Preprocessor): def _compatible_with_pretrain(self, data): if 'image' in data and self.cfg.model.get('type', None) == 'ofa': - image = load_image(data['image']) + if isinstance(data['image'], str): + image = load_image(data['image']) + else: + image = data['image'] + if image.mode != 'RGB': + image = image.convert('RGB') img_buffer = BytesIO() image.save(img_buffer, format='JPEG') data['image'] = Image.open(img_buffer) @@ -102,8 +107,6 @@ class OfaPreprocessor(Preprocessor): for k, v in data.items(): str_data[k] = str(v) sample['sample'] = str_data - # import pdb - # pdb.set_trace() if self.no_collate: return sample else: diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 8bbe02d1..9c6c4d7e 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -42,6 +42,7 @@ class OfaBasePreprocessor: for key, value in tokenizer.get_vocab().items() } self.max_src_length = cfg.model.get('max_src_length', 256) + self.max_tgt_length = cfg.model.get('max_tgt_length', 256) self.max_image_size = cfg.model.get('max_image_size', 512) self.language = self.cfg.model.get('language', 'en') self.prompt_type = self.cfg.model.get('prompt_type', 'none') @@ -58,22 +59,23 @@ class OfaBasePreprocessor: self.std = [0.5, 0.5, 0.5] self.patch_image_size = self.cfg.model.get('patch_image_size', 480) self.constraint_trie = None - self.index2ans = {} - if self.cfg.model.get('answer2label', False): + if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) with open(ans2label_file, 'r') as reader: ans2label_dict = json.load(reader) + self.ans2label = ans2label_dict + self.label2ans = {v: k for k, v in self.ans2label.items()} self.constraint_trie = Trie(tokenizer.eos_token_id) for i, answer in enumerate(ans2label_dict.keys()): - answer_item = tokenizer( - ' ' + answer, - return_tensors='pt', - add_special_tokens=False).input_ids.squeeze(0) + answer_item = self.tokenize_text( + ' ' + answer, add_bos=False, add_eos=False) self.constraint_trie.insert([tokenizer.bos_token_id] + answer_item.tolist() + [tokenizer.eos_token_id]) - def get_inputs(self, text, add_bos=True, add_eos=True): + def tokenize_text(self, text, add_bos=True, add_eos=True): + if text is None: + return None inputs = self.tokenizer( text, max_length=self.max_src_length, @@ -88,7 +90,7 @@ class OfaBasePreprocessor: @staticmethod def pre_caption(caption, max_words=None): - caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ')\ + caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ') \ .replace('/', ' ').replace('', 'person') caption = re.sub( @@ -126,3 +128,18 @@ class OfaBasePreprocessor: question = ' '.join(question_words[:max_ques_words]) return question + + def add_constraint_mask(self, sample): + target_itm = sample['target'] + len_label_itm = target_itm.ne(self.pad_item).sum(dim=0).item() + if self.constraint_trie: + constraint_mask = torch.zeros( + (len(target_itm), len(self.tgt_dict))).bool() + start_idx = len(target_itm) - len_label_itm + for i in range(start_idx, len(target_itm)): + constraint_prefix_token = self.bos_item.tolist( + ) + target_itm[start_idx:i].tolist() + 
constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 884e5ff8..f62f4f1c 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -38,14 +38,29 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: image = data['image'] if isinstance( data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') - inputs = self.get_inputs(prompt) + inputs = self.tokenize_text(prompt) sample = { 'source': inputs, 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } return sample + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = data['target'] + target = target.translate(self.transtab).strip() + target_token_list = target.strip().split() + target = ' '.join(target_token_list[:self.max_tgt_length]) + sample['target'] = self.tokenize_text(target) + return sample diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index f4d5c08a..49968823 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -42,7 +42,7 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') - inputs = self.get_inputs(prompt) + inputs = self.tokenize_text(prompt) sample = { 'source': inputs, 'patch_image': patch_image, diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index 9867954a..cfd3c23d 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -31,7 +31,7 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): prompt = self.cfg.model.get( 'prompt', ' " {} " Summarize the article with a title: ') text = prompt.format(source) - inputs = self.get_inputs(text) + inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item elif self.prompt_type == 'prev_output': diff --git a/modelscope/preprocessors/ofa/text_classification.py b/modelscope/preprocessors/ofa/text_classification.py index 06e35b78..24c4f67e 100644 --- a/modelscope/preprocessors/ofa/text_classification.py +++ b/modelscope/preprocessors/ofa/text_classification.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict +import torch + from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -24,24 +26,56 @@ class OfaTextClassificationPreprocessor(OfaBasePreprocessor): self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_instruction(self, data): text1 = ' '.join( data['text'].lower().strip().split()[:self.max_src_length]) text2 = ' '.join( data['text2'].lower().strip().split()[:self.max_src_length]) prompt = ' can text1 " {} " imply text2 " {} "?' text = prompt.format(text1, text2) - inputs = self.get_inputs(text) + instruction_itm = self.tokenize_text(text) + return instruction_itm + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + instruction_itm = self._build_instruction(data) + assert 'label' in data, 'there must has `label` column in train phase ' + label = data['label'] + if self.label2ans: + label = self.label2ans[label] # ans + label_itm = self.tokenize_text(f' {label}', add_bos=False) + if self.prompt_type == 'none': + target_itm = label_itm + elif self.prompt_type == 'prev_output': + target_itm = torch.cat([instruction_itm[1:-1], label_itm]) + else: + raise NotImplementedError + prev_output_itm = torch.cat([self.bos_item, target_itm[:-1]]) + target_itm[:-len(label_itm)] = self.pad_item + sample = { + 'source': instruction_itm, + 'target': target_itm, + 'prev_output_tokens': prev_output_itm, + } + self.add_constraint_mask(sample) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + instruction_itm = self._build_instruction(data) if self.prompt_type == 'none': - decoder_prompt = self.bos_item - elif self.prompt_type == 'src': - decoder_prompt = inputs + prefix_token = [] elif self.prompt_type == 'prev_output': - decoder_prompt = inputs[:-1] + prefix_token = instruction_itm[:-1] # remove eos else: raise NotImplementedError sample = { - 'source': inputs, - 'decoder_prompt': decoder_prompt, - 'prefix_token': decoder_prompt[:-1], + 'source': instruction_itm, + 'prefix_token': prefix_token, } + if 'label' in data: + sample['label'] = self.label2ans[data['label']] return sample diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py index 83c4e28a..2f6000eb 100644 --- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py +++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py @@ -30,7 +30,7 @@ class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): source = ' '.join( data['text'].lower().strip().split()[:self.max_src_length]) source = 'what is the complete image? 
caption: {}'.format(source) - inputs = self.get_inputs(source) + inputs = self.tokenize_text(source) sample = { 'source': inputs, 'patch_images': None, diff --git a/modelscope/preprocessors/ofa/utils/collate.py b/modelscope/preprocessors/ofa/utils/collate.py index 7c17c23b..b128c3fb 100644 --- a/modelscope/preprocessors/ofa/utils/collate.py +++ b/modelscope/preprocessors/ofa/utils/collate.py @@ -47,6 +47,8 @@ def collate_fn(samples, pad_idx, eos_idx): batch['conf'] = torch.cat([s['conf'] for s in samples], dim=0) if samples[0].get('ref_dict', None) is not None: batch['ref_dict'] = np.array([s['ref_dict'] for s in samples]) + if samples[0].get('label', None) is not None: + batch['labels'] = np.array([s['label'] for s in samples]).tolist() if samples[0].get('constraint_mask', None) is not None: batch['constraint_masks'] = merge('constraint_mask') if samples[0].get('decoder_prompt', None) is not None: diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index 1cc5bc5c..61c3cc6a 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -53,7 +53,7 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): prompt = self.cfg.model.get( 'prompt', ' can image and text1 " {} " imply text2 " {} "?') text = prompt.format(caption, hypothesis) - inputs = self.get_inputs(text) + inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item elif self.prompt_type == 'src': diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index 43f80c7b..8b116463 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -48,7 +48,7 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): prompt = self.cfg.model.get( 'prompt', ' which region does the text " {} " describe?') text = prompt.format(src_caption) - src_item = self.get_inputs(text) + src_item = self.tokenize_text(text) sample = { 'source': src_item, 'patch_image': patch_image, diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index 01c22537..11104e7e 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -42,7 +42,7 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) text = ' {}'.format(data['text']) - inputs = self.get_inputs(text) + inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item elif self.prompt_type == 'src': diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 0c33118e..c17a15f7 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -3,6 +3,7 @@ from functools import partial from typing import Dict, Optional from datasets import load_dataset +from torch import distributed as dist from modelscope.metainfo import Trainers from modelscope.models.base import Model @@ -15,7 +16,7 @@ from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config from modelscope.utils.constant import ConfigKeys, ModeKeys, ModelFile from .ofa_trainer_utils import 
(AdjustLabelSmoothedCrossEntropyCriterion, - OFADataset, get_schedule) + get_schedule) @TRAINERS.register_module(module_name=Trainers.ofa_tasks) @@ -36,31 +37,13 @@ class OFATrainer(EpochBasedTrainer): preprocessor = { ConfigKeys.train: OfaPreprocessor( - model_dir=model_dir, model=ModeKeys.TRAIN, no_collate=True), + model_dir=model_dir, mode=ModeKeys.TRAIN, no_collate=True), ConfigKeys.val: OfaPreprocessor( - model_dir=model_dir, model=ModeKeys.EVAL, no_collate=True), + model_dir=model_dir, mode=ModeKeys.EVAL, no_collate=True), } - # train_dataset = dataset['train'].to_torch_dataset( - # preprocessors=OfaPreprocessor(model_dir=model_dir, model=ModeKeys.TRAIN, no_collate=True), - # ) - # valid_dataset = dataset['valid'].to_torch_dataset( - # preprocessors=OfaPreprocessor(model_dir=model_dir, model=ModeKeys.TRAIN, no_collate=True), - # ) - # train_dataset = OFADataset( - # file_path=cfg.dataset.train_set, - # selected_id_keys=cfg.dataset.selected_id_keys, - # preprocessor=OfaPreprocessor( - # model_dir=model_dir, mode=ModeKeys.TRAIN), - # ) - # val_dataset = OFADataset( - # file_path=cfg.dataset.valid_set, - # selected_id_keys=cfg.dataset.selected_id_keys, - # preprocessor=OfaPreprocessor( - # model_dir=model_dir, mode=ModeKeys.EVAL), - # ) epoch_steps = len(dataset['train']) // ( - cfg.train.gradient_accumulation_steps + cfg.train.optimizer_hook.cumulative_iters * cfg.train.dataloader.batch_size_per_gpu) cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs cfg.train.criterion.tokenizer = model.tokenizer @@ -78,6 +61,11 @@ class OFATrainer(EpochBasedTrainer): pad_idx=model.tokenizer.pad_token_id, eos_idx=model.tokenizer.eos_token_id, ) + if 'launcher' not in kwargs and cfg.train.get('launcher', None): + kwargs['launcher'] = cfg.train.launcher + if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): + kwargs['use_fp16'] = cfg.train.use_fp16 + super().__init__( cfg_file=cfg_file, model=model, @@ -91,14 +79,28 @@ class OFATrainer(EpochBasedTrainer): **kwargs, ) - # def train(self, *args, **kwargs): - # pass - - def evaluate(self, - checkpoint_path: Optional[str] = None, - *args, - **kwargs) -> Dict[str, float]: - pass - - def prediction_step(self, model, inputs): - pass + def train_step(self, model, inputs): + model.train() + model_outputs = model.forward(inputs) + loss, sample_size, logging_output = self.criterion( + model_outputs, inputs) + train_outputs = {'loss': loss} + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + self.train_outputs = train_outputs diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py deleted file mode 100644 index 5e41b49b..00000000 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_old.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -from os import path as osp -from typing import Dict, Optional - -import torch -import torch.distributed as dist -import transformers -from torch.utils.data import DataLoader 
-from torch.utils.data.distributed import DistributedSampler - -from modelscope.metainfo import Trainers -from modelscope.models.base import Model -from modelscope.preprocessors.multi_modal import OfaPreprocessor -from modelscope.preprocessors.ofa.utils.collate import collate_fn -from modelscope.trainers.base import BaseTrainer -from modelscope.trainers.builder import TRAINERS -from modelscope.utils.constant import ModeKeys, ModelFile -from modelscope.utils.logger import get_logger -from modelscope.utils.torch_utils import init_dist -from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, - OFADataset, get_schedule) - -logger = get_logger() - - -@TRAINERS.register_module(module_name=Trainers.ofa_tasks) -class OFAOldTrainer(BaseTrainer): - - def __init__(self, model: str, *args, **kwargs): - model = Model.from_pretrained(model) - super().__init__(osp.join(model.model_dir, ModelFile.CONFIGURATION)) - self.model_dir = model.model_dir - self.model = model.model - self.device_id = 0 - self.total_epoch = self.cfg.train.epoch - self.train_batch_size = self.cfg.train.batch_size - self.val_batch_size = self.cfg.evaluation.batch_size - self.save_dir = self.cfg.train.save_dir - init_dist(launcher='pytorch') - self.train_dataset = OFADataset( - file_path=self.cfg.dataset.train_set, - selected_id_keys=self.cfg.dataset.selected_id_keys, - preprocessor=OfaPreprocessor( - model_dir=self.model_dir, split=ModeKeys.TRAIN), - ) - self.val_dataset = OFADataset( - file_path=self.cfg.dataset.valid_set, - selected_id_keys=self.cfg.dataset.selected_id_keys, - preprocessor=OfaPreprocessor( - model_dir=self.model_dir, split=ModeKeys.EVAL), - ) - epoch_steps = len( - self.train_dataset) // self.cfg.train.gradient_accumulation_steps - self.cfg.train.num_train_steps = epoch_steps * self.cfg.train.epoch - self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( - self.cfg.train.criterion) - - def train(self, *args, **kwargs): - assert dist.is_initialized() - - self.model.train() - self.model.to(self.device_id) - ddp_model = torch.nn.parallel.DistributedDataParallel( - self.model, device_ids=[ - self.device_id, - ]) - - optimizer = transformers.AdamW( - self.model.parameters(), - lr=self.cfg.train.lr, - weight_decay=self.cfg.train.weight_decay, - correct_bias=False, - ) - scheduler_class, scheduler_args = get_schedule(self.cfg.train) - if scheduler_class is not None: - lr_scheduler = scheduler_class(**{'optimizer': optimizer}, - **scheduler_args) - else: - lr_scheduler = None - for epoch in range(self.total_epoch): - train_sampler = DistributedSampler( - dataset=self.train_dataset, shuffle=True) - train_sampler.set_epoch(epoch) - - train_params = { - 'pin_memory': True, - 'collate_fn': collate_fn, - 'batch_size': self.train_batch_size, - 'shuffle': False, - 'drop_last': True, - 'sampler': train_sampler, - 'num_workers': 2, - } - - train_loader = DataLoader(self.train_dataset, **train_params) - - for idx, batch in enumerate(train_loader, start=1): - model_outputs = ddp_model(**batch) - loss, sample_size, logging_output = self.criterion( - model_outputs, batch) - loss.backward() - optimizer.zero_grad() - if lr_scheduler is not None: - lr_scheduler.step() - optimizer.step() - optimizer.zero_grad() - if idx % 10 == 0: - logger.info( - 'epoch: {}, train batch {}/{}, loss={:.5f}'.format( - epoch, idx, len(train_loader), loss.item())) - if dist.get_rank() == 0: - os.makedirs(self.ckpt_dir, exist_ok=True) - torch.save(ddp_model.module.state_dict(), - f'{self.ckpt_dir}/epoch{epoch}.bin') - - def evaluate(self, 
- checkpoint_path: Optional[str] = None, - *args, - **kwargs) -> Dict[str, float]: - pass diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 38a13f4d..cdae21c6 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -172,47 +172,11 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ - if isinstance(sample, list): - if self.sample_patch_num > 0: - sample[0]['net_input'][ - 'sample_patch_num'] = self.sample_patch_num - loss_v1, sample_size_v1, logging_output_v1 = self.forward( - output[0], sample[0], update_num, reduce) - loss_v2, sample_size_v2, logging_output_v2 = self.forward( - output[1], sample[1], update_num, reduce) - loss = loss_v1 / sample_size_v1 + loss_v2 / sample_size_v2 - sample_size = 1 - logging_output = { - 'loss': - loss.data, - 'loss_v1': - loss_v1.data, - 'loss_v2': - loss_v2.data, - 'nll_loss': - logging_output_v1['nll_loss'].data / sample_size_v1 - + logging_output_v2['nll_loss'].data / sample_size_v2, - 'ntokens': - logging_output_v1['ntokens'] + logging_output_v2['ntokens'], - 'nsentences': - logging_output_v1['nsentences'] - + logging_output_v2['nsentences'], - 'sample_size': - 1, - 'sample_size_v1': - sample_size_v1, - 'sample_size_v2': - sample_size_v2, - } - return loss, sample_size, logging_output - if self.use_rdrop: construct_rdrop_sample(sample) - net_output = output - # model(**sample["net_input"]) loss, nll_loss, ntokens = self.compute_loss( - net_output, sample, update_num, reduce=reduce) + output, sample, update_num, reduce=reduce) sample_size = ( sample['target'].size(0) if self.sentence_avg else ntokens) logging_output = { diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 62378997..6bfdd2a4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -12,6 +12,7 @@ import numpy as np import torch from torch import distributed as dist from torch import nn +from torch.nn.parallel import DistributedDataParallel from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler @@ -159,8 +160,6 @@ class EpochBasedTrainer(BaseTrainer): train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.train_preprocessor) - # import pdb - # pdb.set_trace() self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, @@ -200,7 +199,6 @@ class EpochBasedTrainer(BaseTrainer): self._max_epochs = self.cfg.train.max_epochs else: self._max_epochs = kwargs['max_epochs'] - self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) if self._train_iters_per_epoch is None and hasattr( @@ -220,12 +218,12 @@ class EpochBasedTrainer(BaseTrainer): init_dist(kwargs['launcher']) self._dist = get_dist_info()[1] > 1 - # model placement if self.device.type == 'cuda': self.model.to(self.device) if not is_parallel(self.model) and self._dist: self.model = self.to_parallel(self.model) + self.device = self.model.device def rebuild_config(self, cfg: Config): """A method used to rebuild the config, any subclass can override this method. 
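The trainer hunks around this point keep hitting the same wrinkle: once the model is wrapped in DistributedDataParallel, `model.forward` refers to the wrapper's forward, so any signature inspection (for example, deciding whether the model takes a dict or keyword arguments) has to look at the wrapped module instead. A minimal sketch of that unwrap pattern follows; `unwrap_module` and `resolve_forward` are illustrative names of my own, not ModelScope APIs.

# Illustrative sketch only: shows why the patch resolves `model.module.forward`
# when the model is wrapped by DistributedDataParallel before inspecting it.
from torch import nn
from torch.nn.parallel import DistributedDataParallel


def unwrap_module(model: nn.Module) -> nn.Module:
    """Return the user model, stripping a DDP wrapper if present."""
    return model.module if isinstance(model, DistributedDataParallel) else model


def resolve_forward(model: nn.Module):
    """Forward function whose signature matches the user's model.

    Inspecting `model.forward` on a DDP instance would inspect
    DistributedDataParallel.forward(*inputs, **kwargs), which hides the real
    argument names; the unwrapped forward keeps them visible.
    """
    return unwrap_module(model).forward

The same resolution is applied in both the train step and the multi-GPU test loop changed by this series, so dict-style and keyword-style inputs are dispatched consistently whether or not the trainer runs distributed.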
@@ -429,7 +427,7 @@ class EpochBasedTrainer(BaseTrainer): self.register_hook_from_cfg(self.cfg.train.hooks) self.train_loop(self.train_dataloader) - def evaluate(self, checkpoint_path=None): + def evaluate(self, checkpoint_path=None, *arg, **kwargs): self.model.eval() self._mode = ModeKeys.EVAL @@ -475,12 +473,12 @@ class EpochBasedTrainer(BaseTrainer): self.cfg.parallel.update( dict(module=model, device_ids=[torch.cuda.current_device()])) return build_parallel(self.cfg.parallel) - + model.to(f'cuda:{torch.cuda.current_device()}') dp_cfg = dict( type='DistributedDataParallel', module=model, + find_unused_parameters=True, device_ids=[torch.cuda.current_device()]) - return build_parallel(dp_cfg) def train_step(self, model, inputs): @@ -504,8 +502,10 @@ class EpochBasedTrainer(BaseTrainer): model.train() self._mode = ModeKeys.TRAIN # call model forward but not __call__ to skip postprocess + forward_func = model.module.forward if \ + isinstance(model, DistributedDataParallel) else model.forward if isinstance(inputs, - Mapping) and not func_receive_dict_inputs(model.forward): + Mapping) and not func_receive_dict_inputs(forward_func): train_outputs = model.forward(**inputs) else: train_outputs = model.forward(inputs) @@ -751,7 +751,7 @@ class EpochBasedTrainer(BaseTrainer): batch_size = batch_size_per_gpu num_workers = workers_per_gpu - if dist: + if dist and not isinstance(dataset, torch.utils.data.IterableDataset): sampler = DistributedSampler( dataset, num_replicas=world_size, rank=rank, shuffle=shuffle) else: diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index d368c340..c6a291d9 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -9,6 +9,7 @@ from collections.abc import Mapping import torch from torch import distributed as dist +from torch.nn.parallel import DistributedDataParallel from tqdm import tqdm from modelscope.utils.data_utils import to_device @@ -68,7 +69,10 @@ def single_gpu_test(model, batch_size = 1 # iteration count else: if isinstance(data, dict): - batch_size = len(next(iter(data.values()))) + if 'nsentences' in data: + batch_size = data['nsentences'] + else: + batch_size = len(next(iter(data.values()))) else: batch_size = len(data) for _ in range(batch_size): @@ -142,28 +146,38 @@ def multi_gpu_test(model, data = to_device(data, device) data_list.append(data) with torch.no_grad(): - if isinstance(data, Mapping) and not func_receive_dict_inputs( - model.forward): + forward_func = model.module.forward if \ + isinstance(model, DistributedDataParallel) else model.forward + if isinstance(data, Mapping + ) and not func_receive_dict_inputs(forward_func): result = model.forward(**data) else: result = model.forward(data) results.append(result) - if rank == 0: - if isinstance(data, dict): - batch_size = len(next(iter(data.values()))) + if isinstance(data, dict): + if 'nsentences' in data: + batch_size = data['nsentences'] else: - batch_size = len(data) - - if progress_with_iters: - total_samples += batch_size * world_size - batch_size = 1 # iteration count + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) + if i >= (data_len // world_size) - 1: + total_samples = torch.LongTensor([batch_size]).to(model.device) + dist.all_reduce(total_samples, op=dist.reduce_op.SUM) + total_samples = total_samples.item() + else: + total_samples = batch_size * world_size + if progress_with_iters: + iter_cnt_all = world_size + else: + iter_cnt_all = total_samples + count += 
iter_cnt_all - batch_size_all = batch_size * world_size - count += batch_size_all + if rank == 0: if count > data_len: - batch_size_all = data_len - (count - batch_size_all) - for _ in range(batch_size_all): + iter_cnt_all = data_len - (count - iter_cnt_all) + for _ in range(iter_cnt_all): pbar.update() if progress_with_iters and (i + 1) >= data_len: diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py index 77e23122..df5470f9 100644 --- a/modelscope/utils/device.py +++ b/modelscope/utils/device.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - +import os from contextlib import contextmanager from modelscope.utils.constant import Devices, Frameworks @@ -105,3 +105,17 @@ def create_device(device_name): device = torch.device('cpu') return device + + +def get_device(): + import torch + from torch import distributed as dist + if torch.cuda.is_available(): + if dist.is_available() and dist.is_initialized( + ) and 'LOCAL_RANK' in os.environ: + device_id = f"cuda:{os.environ['LOCAL_RANK']}" + else: + device_id = 'cuda:0' + else: + device_id = 'cpu' + return torch.device(device_id) diff --git a/modelscope/utils/multi_modal/forked_pdb.py b/modelscope/utils/multi_modal/forked_pdb.py new file mode 100644 index 00000000..56107d1f --- /dev/null +++ b/modelscope/utils/multi_modal/forked_pdb.py @@ -0,0 +1,17 @@ +import pdb +import sys + + +class ForkedPdb(pdb.Pdb): + """A Pdb subclass that may be used + from a forked multiprocessing child + + """ + + def interaction(self, *args, **kwargs): + _stdin = sys.stdin + try: + sys.stdin = open('/dev/stdin') + pdb.Pdb.interaction(self, *args, **kwargs) + finally: + sys.stdin = _stdin diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 9044e41a..8779ba48 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -252,6 +252,27 @@ class OfaTasksTest(unittest.TestCase): result[OutputKeys.OUTPUT_IMG].save('result.png') print(f'Output written to {osp.abspath("result.png")}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_visual_question_answering_huge_with_name(self): + model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_visual-question-answering_pretrain_huge_en' + ofa_pipe = pipeline(Tasks.visual_question_answering, model=model) + image = 'data/test/images/visual_question_answering.png' + text = 'what is grown on the plant?' 
+ input = {'image': image, 'text': text} + result = ofa_pipe(input) + print(result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_image_captioning_huge_with_name(self): + model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_huge_en' + img_captioning = pipeline( + task=Tasks.image_captioning, + model=model, + ) + result = img_captioning( + {'image': 'data/test/images/image_captioning.png'}) + print(result[OutputKeys.CAPTION]) + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index af0cf2dc..39d9fe0c 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -11,6 +11,7 @@ class TestOfaTrainer(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer(self): + model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/maas_mnli_pretrain_ckpt' model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_text-classification_mnli_large_en' self.trainer = OFATrainer(model_id) self.trainer.train() From 3b09d848ceeaeda7f27dfd9eeeffa58e3b6a9ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Wed, 28 Sep 2022 16:02:28 +0800 Subject: [PATCH 06/54] update --- .../models/multi_modal/ofa_for_all_tasks.py | 5 +- .../multi_modal/ofa/ofa_file_dataset.py | 133 ------------------ .../trainers/multi_modal/ofa/ofa_trainer.py | 2 +- modelscope/trainers/trainer.py | 17 ++- tests/pipelines/test_ofa_tasks.py | 5 +- tests/trainers/test_ofa_trainer.py | 5 +- 6 files changed, 21 insertions(+), 146 deletions(-) delete mode 100644 modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index ab9b0357..38d1538d 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -129,8 +129,7 @@ class OfaForAllTasks(TorchModel): result_l = list() for cap in caption: result_l.append(cap.translate(self.transtab).strip()) - input[OutputKeys.CAPTION] = caption - + input[OutputKeys.CAPTION] = result_l return input def _text_gen_inference(self, input): @@ -182,6 +181,8 @@ class OfaForAllTasks(TorchModel): encoder_input[key] = input['net_input'][key] encoder_out = self.model.encoder(**encoder_input) valid_result = [] + import pdb + pdb.set_trace() for val_ans, val_masks in zip(self.val_ans_l, self.val_masks_l): valid_size = len(val_ans) valid_tgt_items = [ diff --git a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py b/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py deleted file mode 100644 index 138f1303..00000000 --- a/modelscope/trainers/multi_modal/ofa/ofa_file_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2022 The OFA-Sys Team. -# All rights reserved. -# This source code is licensed under the Apache 2.0 license -# found in the LICENSE file in the root directory. 
- -import os -import pickle - -import torch - - -class OFAFileDataset: - - def __init__(self, - file_path, - selected_col_ids=None, - dtypes=None, - separator='\t', - cached_index=False): - self.file_path = file_path - assert os.path.exists( - self.file_path), 'Error: The local datafile {} not exists!'.format( - self.file_path) - - self.separator = separator - if selected_col_ids is None: - # default to all fields - self.selected_col_ids = list( - range( - len( - open(self.file_path).readline().rstrip('\n').split( - self.separator)))) - else: - self.selected_col_ids = [ - int(col_id) for col_id in selected_col_ids.split(',') - ] - if dtypes is None: - # default to str - self.dtypes = [str for col_id in self.selected_col_ids] - else: - self.dtypes = [eval(col_dtype) for col_dtype in dtypes.split(',')] - assert len(self.dtypes) == len(self.selected_col_ids) - - self.data_cnt = 0 - try: - self.slice_id = torch.distributed.get_rank() - self.slice_count = torch.distributed.get_world_size() - except Exception: - self.slice_id = 0 - self.slice_count = 1 - self.cached_index = cached_index - self._init_seek_index() - self._reader = self._get_reader() - print('file {} slice_id {} row count {} total row count {}'.format( - self.file_path, self.slice_id, self.row_count, - self.total_row_count)) - - def _init_seek_index(self): - if self.cached_index: - cache_path = '{}.index'.format(self.file_path) - assert os.path.exists( - cache_path), 'cache file {} not exists!'.format(cache_path) - self.total_row_count, self.lineid_to_offset = pickle.load( - open(cache_path, 'rb')) - print( - 'local datafile {} slice_id {} use cached row_count and line_idx-to-offset mapping' - .format(self.file_path, self.slice_id)) - else: - # make an iteration over the file to get row_count and line_idx-to-offset mapping - fp = open(self.file_path, 'r') - print( - 'local datafile {} slice_id {} begin to initialize row_count and line_idx-to-offset mapping' - .format(self.file_path, self.slice_id)) - self.total_row_count = 0 - offset = 0 - self.lineid_to_offset = [] - for line in fp: - self.lineid_to_offset.append(offset) - self.total_row_count += 1 - offset += len(line.encode('utf-8')) - pickle.dump(self.lineid_to_offset, - open('{}.index'.format(self.file_path), 'wb')) - self._compute_start_pos_and_row_count() - print( - 'local datafile {} slice_id {} finished initializing row_count and line_idx-to-offset mapping' - .format(self.file_path, self.slice_id)) - - def _compute_start_pos_and_row_count(self): - self.row_count = self.total_row_count // self.slice_count - if self.slice_id < self.total_row_count - self.row_count * self.slice_count: - self.row_count += 1 - self.start_pos = self.row_count * self.slice_id - else: - self.start_pos = self.row_count * self.slice_id + ( - self.total_row_count - self.row_count * self.slice_count) - - def _get_reader(self): - fp = open(self.file_path, 'r') - fp.seek(self.lineid_to_offset[self.start_pos]) - return fp - - def _seek(self, offset=0): - try: - print('slice_id {} seek offset {}'.format(self.slice_id, - self.start_pos + offset)) - self._reader.seek(self.lineid_to_offset[self.start_pos + offset]) - self.data_cnt = offset - except Exception: - print('slice_id {} seek offset {}'.format(self.slice_id, offset)) - self._reader.seek(self.lineid_to_offset[offset]) - self.data_cnt = offset - - def __del__(self): - self._reader.close() - - def __len__(self): - return self.row_count - - def get_total_row_count(self): - return self.total_row_count - - def __getitem__(self, index): - if self.data_cnt 
== self.row_count: - print('reach the end of datafile, start a new reader') - self.data_cnt = 0 - self._reader = self._get_reader() - column_l = self._reader.readline().rstrip('\n').split(self.separator) - self.data_cnt += 1 - column_l = [ - dtype(column_l[col_id]) - for col_id, dtype in zip(self.selected_col_ids, self.dtypes) - ] - return column_l diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index c17a15f7..42a68d02 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -65,7 +65,7 @@ class OFATrainer(EpochBasedTrainer): kwargs['launcher'] = cfg.train.launcher if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): kwargs['use_fp16'] = cfg.train.use_fp16 - + kwargs['to_tensor'] = False super().__init__( cfg_file=cfg_file, model=model, diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 793092c8..8412280b 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -167,19 +167,20 @@ class EpochBasedTrainer(BaseTrainer): device_name = f'cuda:{local_rank}' self.device = create_device(device_name) - self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, task_data_config=self.cfg.dataset.get('train', None) if hasattr( self.cfg, 'dataset') else None, - preprocessor=self.train_preprocessor) + preprocessor=self.train_preprocessor, + **kwargs) self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, task_data_config=self.cfg.dataset.get('val', None) if hasattr( self.cfg, 'dataset') else None, - preprocessor=self.eval_preprocessor) + preprocessor=self.eval_preprocessor, + **kwargs) self.train_data_collator, self.eval_default_collate = None, None if isinstance(data_collator, Mapping): @@ -305,13 +306,15 @@ class EpochBasedTrainer(BaseTrainer): datasets: Union[Dataset, List[Dataset]], mode: str, task_data_config: Config = None, - preprocessor: Optional[Preprocessor] = None): + preprocessor: Optional[Preprocessor] = None, + **kwargs): """Build the task specific dataset processor for this trainer. Returns: The task dataset processor for the task. If no result for the very model-type and task, the default TaskDataset will be returned. 
""" try: + to_tensor = kwargs.get('to_tensor', True) if not datasets: return datasets if isinstance(datasets, TorchTaskDataset): @@ -327,7 +330,8 @@ class EpochBasedTrainer(BaseTrainer): return datasets.to_torch_dataset( task_data_config=task_data_config, task_name=self.cfg.task, - preprocessors=preprocessor) + preprocessors=preprocessor, + to_tensor=to_tensor) elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): if task_data_config is None: @@ -341,7 +345,8 @@ class EpochBasedTrainer(BaseTrainer): d.to_torch_dataset( task_data_config=task_data_config, task_name=self.cfg.task, - preprocessors=preprocessor) for d in datasets + preprocessors=preprocessor, + to_tensor=to_tensor) for d in datasets ] cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index e6638dfa..d89e5d48 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -94,8 +94,11 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_text_classification_with_model(self): + # model = Model.from_pretrained( + # 'damo/ofa_text-classification_mnli_large_en') model = Model.from_pretrained( - 'damo/ofa_text-classification_mnli_large_en') + '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_text-classification_mnli_large_en' + ) ofa_pipe = pipeline(Tasks.text_classification, model=model) text = 'One of our number will carry out your instructions minutely.' text2 = 'A member of my team will execute your orders with immense precision.' diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 39d9fe0c..3948aad7 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -12,11 +12,10 @@ class TestOfaTrainer(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer(self): model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/maas_mnli_pretrain_ckpt' - model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_text-classification_mnli_large_en' - self.trainer = OFATrainer(model_id) + self.trainer = OFATrainer(model_id, launcher='pytorch') self.trainer.train() if os.path.exists(self.trainer.work_dir): - shutil.rmtree(self.trainer.work_dir) + pass if __name__ == '__main__': From 993b944b654f89d7231dc840e7bd12cae1381db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Wed, 28 Sep 2022 16:54:41 +0800 Subject: [PATCH 07/54] update --- tests/pipelines/test_ofa_tasks.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index d89e5d48..4bdb394a 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -37,6 +37,19 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = img_captioning({'image': image}) print(result[OutputKeys.CAPTION]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_image_captioning_zh_with_model(self): + model = Model.from_pretrained( + '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_base_zh' + ) + img_captioning = pipeline( + task=Tasks.image_captioning, + model=model, + ) + image = 'data/test/images/image_captioning.png' + result = img_captioning({'image': image}) + print(result[OutputKeys.CAPTION]) + @unittest.skipUnless(test_level() >= 0, 'skip test 
in current test level') def test_run_with_image_captioning_with_name(self): img_captioning = pipeline( From a799dd237d807eceef80bf3361f5bd2a0db9ce1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Thu, 29 Sep 2022 15:26:20 +0800 Subject: [PATCH 08/54] remove ofa_file_dataset --- .../models/multi_modal/ofa_for_all_tasks.py | 17 ++++++++++++++++- .../multi_modal/ofa/ofa_trainer_utils.py | 1 - 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 38d1538d..dc2db59c 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -1,8 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import math +import os import string +from functools import partial from os import path as osp -from typing import Any, Dict +from typing import Any, Callable, Dict, List, Optional, Union import json import torch.cuda @@ -295,3 +297,16 @@ class OfaForAllTasks(TorchModel): self.cfg.model.answer2label) with open(ans2label_file, 'r') as reader: self.ans2label_dict = json.load(reader) + + def save_pretrained(self, + target_folder: Union[str, os.PathLike], + save_checkpoint_names: Union[str, List[str]] = None, + save_function: Callable = None, + config: Optional[dict] = None, + **kwargs): + super(OfaForAllTasks, self). \ + save_pretrained(target_folder=target_folder, + save_checkpoint_names=save_checkpoint_names, + save_function=partial(save_function, with_meta=False), + config=config, + **kwargs) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index cdae21c6..ecd8cd1d 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -12,7 +12,6 @@ from torch.nn.modules.loss import _Loss from torch.utils.data import Dataset from modelscope.preprocessors.multi_modal import OfaPreprocessor -from .ofa_file_dataset import OFAFileDataset class OFADataset(Dataset): From ac653594d8679278f293556353d69b38c70973b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Fri, 30 Sep 2022 15:49:21 +0800 Subject: [PATCH 09/54] caption finetune done, need add belu --- .../models/multi_modal/ofa_for_all_tasks.py | 2 +- modelscope/preprocessors/ofa/base.py | 4 ++ .../preprocessors/ofa/image_captioning.py | 21 +++++----- .../hooks/optimizer/torch_optimizer_hook.py | 1 + .../trainers/multi_modal/ofa/ofa_trainer.py | 39 +++++++++++++------ modelscope/trainers/trainer.py | 1 - 6 files changed, 46 insertions(+), 22 deletions(-) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index dc2db59c..cf5a8112 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -126,7 +126,7 @@ class OfaForAllTasks(TorchModel): return ret def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: - if self.cfg.task == Tasks.image_captioning: + if not self.model.training and self.cfg.task == Tasks.image_captioning: caption = input[OutputKeys.CAPTION] result_l = list() for cap in caption: diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 9c6c4d7e..47d70f6d 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. import re +import string from os import path as osp import json @@ -58,6 +59,9 @@ class OfaBasePreprocessor: self.mean = [0.5, 0.5, 0.5] self.std = [0.5, 0.5, 0.5] self.patch_image_size = self.cfg.model.get('patch_image_size', 480) + self.transtab = str.maketrans( + {key: None + for key in string.punctuation}) self.constraint_trie = None if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index f62f4f1c..cfc1e243 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os from typing import Any, Dict, Union import torch @@ -43,6 +44,17 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): else: return self._build_infer_sample(data) + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = data['text'] + target = target.translate(self.transtab).strip() + target_token_list = target.strip().split() + target = ' '.join(target_token_list[:self.max_tgt_length]) + sample['target'] = self.tokenize_text(target, add_bos=False) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, sample['target'][:-1]]) + return sample + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: image = data['image'] if isinstance( data['image'], Image.Image) else load_image(data['image']) @@ -55,12 +67,3 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]) } return sample - - def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: - sample = self._build_infer_sample(data) - target = data['target'] - target = target.translate(self.transtab).strip() - target_token_list = target.strip().split() - target = ' '.join(target_token_list[:self.max_tgt_length]) - sample['target'] = self.tokenize_text(target) - return sample diff --git a/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py b/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py index 30ea88a2..2a5ce88a 100644 --- a/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py +++ b/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py @@ -79,5 +79,6 @@ class TorchAMPOptimizerHook(OptimizerHook): self.scaler.step(trainer.optimizer) self.scaler.update(self._scale_update_param) trainer.optimizer.zero_grad() + print('xcxcxcxcxc: optimizer step') setattr(self._model, 'forward', self._ori_model_forward) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 42a68d02..5c65a129 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -1,6 +1,6 @@ +import math import os from functools import partial -from typing import Dict, Optional from datasets import load_dataset from torch import distributed as dist @@ -27,13 +27,7 @@ class OFATrainer(EpochBasedTrainer): model_dir = model.model_dir cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) - dataset = load_dataset( - cfg.dataset.script, - data_files=cfg.dataset.hf_dataset, - sep=cfg.dataset.sep, - ) - dataset = MsDataset.from_hf_dataset( - dataset.rename_columns(cfg.dataset.column_map)) + dataset = self._build_dataset_with_config(cfg) preprocessor = { 
ConfigKeys.train: OfaPreprocessor( @@ -42,9 +36,11 @@ class OFATrainer(EpochBasedTrainer): OfaPreprocessor( model_dir=model_dir, mode=ModeKeys.EVAL, no_collate=True), } - epoch_steps = len(dataset['train']) // ( - cfg.train.optimizer_hook.cumulative_iters - * cfg.train.dataloader.batch_size_per_gpu) + # use torchrun launch + world_size = int(os.environ.get('WORLD_SIZE', 1)) + epoch_steps = math.ceil( + len(dataset['train']) / # noqa + (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs cfg.train.criterion.tokenizer = model.tokenizer self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( @@ -104,3 +100,24 @@ class OFATrainer(EpochBasedTrainer): else: self.log_buffer.update(train_outputs['log_vars']) self.train_outputs = train_outputs + + def _build_dataset_with_config(self, cfg): + if hasattr(cfg.dataset, 'hf_dataset'): + dataset = load_dataset( + cfg.dataset.script, + data_files=cfg.dataset.hf_dataset, + sep=cfg.dataset.sep, + ) + dataset = MsDataset.from_hf_dataset( + dataset.rename_columns(cfg.dataset.column_map)) + return dataset + elif hasattr(cfg.dataset, 'ms_dataset'): + dataset_d = dict() + for key in cfg.dataset.ms_dataset.keys(): + dataset_d[key] = MsDataset.load(**cfg.dataset.ms_dataset[key]) + dataset_d[key] = MsDataset.from_hf_dataset( + dataset_d[key]._hf_ds.rename_columns( + cfg.dataset.column_map)) + return dataset_d + else: + raise NotImplementedError diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 824f0091..cb3436e1 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -216,7 +216,6 @@ class EpochBasedTrainer(BaseTrainer): self._max_epochs = self.cfg.train.max_epochs else: self._max_epochs = kwargs['max_epochs'] - self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) if self._train_iters_per_epoch is None and hasattr( From dbf022efe87ef61577eaed8aedcf8d05722ba681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Fri, 30 Sep 2022 17:44:35 +0800 Subject: [PATCH 10/54] caption finetune done, add belu --- modelscope/metainfo.py | 3 ++ modelscope/metrics/__init__.py | 4 ++ modelscope/metrics/accuracy_metric.py | 2 +- modelscope/metrics/bleu_metric.py | 42 +++++++++++++++++++ .../models/multi_modal/ofa_for_all_tasks.py | 2 - .../preprocessors/ofa/image_captioning.py | 2 + .../hooks/optimizer/torch_optimizer_hook.py | 1 - requirements/multi-modal.txt | 1 + tests/trainers/test_ofa_trainer.py | 9 ++-- 9 files changed, 58 insertions(+), 8 deletions(-) create mode 100644 modelscope/metrics/bleu_metric.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 42f04461..f94d4103 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -334,6 +334,9 @@ class Metrics(object): accuracy = 'accuracy' audio_noise_metric = 'audio-noise-metric' + # text gen + bleu = 'bleu' + # metrics for image denoise task image_denoise_metric = 'image-denoise-metric' diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index d3975a2c..90d7db7b 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: from .token_classification_metric import TokenClassificationMetric from .video_summarization_metric import VideoSummarizationMetric from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric + from .accuracy_metric import AccuracyMetric 
+ from .bleu_metric import BleuMetric else: _import_structure = { @@ -34,6 +36,8 @@ else: 'token_classification_metric': ['TokenClassificationMetric'], 'video_summarization_metric': ['VideoSummarizationMetric'], 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], + 'accuracy_metric': ['AccuracyMetric'], + 'bleu_metric': ['BleuMetric'], } import sys diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index 0f73ce64..aab9a138 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -11,7 +11,7 @@ from .builder import METRICS, MetricKeys @METRICS.register_module(group_key=default_group, module_name=Metrics.accuracy) class AccuracyMetric(Metric): - """The metric computation class for sequence classification classes. + """The metric computation class for classification classes. This metric class calculates accuracy for the whole input batches. """ diff --git a/modelscope/metrics/bleu_metric.py b/modelscope/metrics/bleu_metric.py new file mode 100644 index 00000000..43d1b105 --- /dev/null +++ b/modelscope/metrics/bleu_metric.py @@ -0,0 +1,42 @@ +from itertools import zip_longest +from typing import Dict + +import sacrebleu + +from modelscope.metainfo import Metrics +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + +EVAL_BLEU_ORDER = 4 + + +@METRICS.register_module(group_key=default_group, module_name=Metrics.bleu) +class BleuMetric(Metric): + """The metric computation bleu for text generation classes. + + This metric class calculates accuracy for the whole input batches. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.eval_tokenized_bleu = kwargs.get('eval_tokenized_bleu', False) + self.hyp_name = kwargs.get('hyp_name', 'hyp') + self.ref_name = kwargs.get('ref_name', 'ref') + self.refs = list() + self.hyps = list() + + def add(self, outputs: Dict, inputs: Dict): + self.refs.extend(inputs[self.ref_name]) + self.hyps.extend(outputs[self.hyp_name]) + + def evaluate(self): + if self.eval_tokenized_bleu: + bleu = sacrebleu.corpus_bleu( + self.hyps, list(zip_longest(*self.refs)), tokenize='none') + else: + bleu = sacrebleu.corpus_bleu(self.hyps, + list(zip_longest(*self.refs))) + return { + MetricKeys.BLEU_4: bleu.score, + } diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index cf5a8112..7ca01d7f 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -183,8 +183,6 @@ class OfaForAllTasks(TorchModel): encoder_input[key] = input['net_input'][key] encoder_out = self.model.encoder(**encoder_input) valid_result = [] - import pdb - pdb.set_trace() for val_ans, val_masks in zip(self.val_ans_l, self.val_masks_l): valid_size = len(val_ans) valid_tgt_items = [ diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index cfc1e243..6c842aa9 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -66,4 +66,6 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } + if 'text' in data: + sample['label'] = data['text'] return sample diff --git a/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py b/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py index 
2a5ce88a..30ea88a2 100644 --- a/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py +++ b/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py @@ -79,6 +79,5 @@ class TorchAMPOptimizerHook(OptimizerHook): self.scaler.step(trainer.optimizer) self.scaler.update(self._scale_update_param) trainer.optimizer.zero_grad() - print('xcxcxcxcxc: optimizer step') setattr(self._model, 'forward', self._ori_model_forward) diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 02e87baa..255f6155 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -5,6 +5,7 @@ pycocotools>=2.0.4 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 +sacrebleu taming-transformers-rom1504 timm tokenizers diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 3948aad7..c0704061 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -9,13 +9,14 @@ from modelscope.utils.test_utils import test_level class TestOfaTrainer(unittest.TestCase): - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer(self): - model_id = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/maas_mnli_pretrain_ckpt' - self.trainer = OFATrainer(model_id, launcher='pytorch') + model_id = 'damo/ofa_image-caption_coco_huge_en' + self.trainer = OFATrainer(model_id) + os.makedirs(self.trainer.work_dir, exist_ok=True) self.trainer.train() if os.path.exists(self.trainer.work_dir): - pass + shutil.rmtree(self.trainer.work_dir) if __name__ == '__main__': From bd0a020a7fbc32503befa33e138763fed665c7f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Fri, 30 Sep 2022 17:48:51 +0800 Subject: [PATCH 11/54] fix tests --- tests/pipelines/test_ofa_tasks.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 4bdb394a..d89e5d48 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -37,19 +37,6 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = img_captioning({'image': image}) print(result[OutputKeys.CAPTION]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_run_with_image_captioning_zh_with_model(self): - model = Model.from_pretrained( - '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_base_zh' - ) - img_captioning = pipeline( - task=Tasks.image_captioning, - model=model, - ) - image = 'data/test/images/image_captioning.png' - result = img_captioning({'image': image}) - print(result[OutputKeys.CAPTION]) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_image_captioning_with_name(self): img_captioning = pipeline( From 7ccf40b6256fa80bb31221e5ad8c91ced49de12d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Sun, 2 Oct 2022 23:21:58 +0800 Subject: [PATCH 12/54] fix device mis match --- modelscope/models/multi_modal/ofa_for_all_tasks.py | 5 +++-- tests/pipelines/test_ofa_tasks.py | 13 ------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 7ca01d7f..41ca1f0b 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ 
b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -187,13 +187,14 @@ class OfaForAllTasks(TorchModel): valid_size = len(val_ans) valid_tgt_items = [ torch.cat([ - torch.tensor(decoder_prompt[1:]), valid_answer, + torch.tensor(decoder_prompt[1:]).to('cpu'), valid_answer, self.eos_item ]) for decoder_prompt in input['decoder_prompts'] for valid_answer in val_ans ] valid_prev_items = [ - torch.cat([torch.tensor(decoder_prompt), valid_answer]) + torch.cat( + [torch.tensor(decoder_prompt).to('cpu'), valid_answer]) for decoder_prompt in input['decoder_prompts'] for valid_answer in val_ans ] diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 4bdb394a..d89e5d48 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -37,19 +37,6 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = img_captioning({'image': image}) print(result[OutputKeys.CAPTION]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_run_with_image_captioning_zh_with_model(self): - model = Model.from_pretrained( - '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_base_zh' - ) - img_captioning = pipeline( - task=Tasks.image_captioning, - model=model, - ) - image = 'data/test/images/image_captioning.png' - result = img_captioning({'image': image}) - print(result[OutputKeys.CAPTION]) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_image_captioning_with_name(self): img_captioning = pipeline( From 4bfceb01a3e5a8d58dcea780b44986d775a14014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Sun, 9 Oct 2022 19:41:06 +0800 Subject: [PATCH 13/54] remove unuse code --- .../multi_modal/ofa/ofa_trainer_utils.py | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index ecd8cd1d..b2e54ec6 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -9,46 +9,6 @@ import torch import torch.nn.functional as F import transformers from torch.nn.modules.loss import _Loss -from torch.utils.data import Dataset - -from modelscope.preprocessors.multi_modal import OfaPreprocessor - - -class OFADataset(Dataset): - - def __init__(self, - file_path: str, - preprocessor: OfaPreprocessor, - selected_id_keys: str, - dtypes=None, - separator='\t', - cached_index=False, - **kwargs): - assert selected_id_keys is not None - selected_col_ids = list() - selected_col_keys = list() - for id_key in selected_id_keys.split(','): - id, key = id_key.split(':') - selected_col_ids.append(id) - selected_col_keys.append(key) - - self.dataset = OFAFileDataset( - file_path=file_path, - selected_col_ids=','.join(selected_col_ids), - dtypes=dtypes, - separator=separator, - cached_index=cached_index) - self.preprocessor = preprocessor - - def __len__(self): - return len(self.dataset) - - def __getitem__(self, index): - values = self.dataset[index] - data = dict() - for key, value in zip(self.selected_col_keys, values): - data[key] = value - return self.preprocessor(data) def construct_rdrop_sample(x): From 466b36942f52d7f426570ef52e7db3daf99341cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Mon, 10 Oct 2022 15:12:32 +0800 Subject: [PATCH 14/54] merge master --- modelscope/models/multi_modal/ofa/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) 
diff --git a/modelscope/models/multi_modal/ofa/utils/__init__.py b/modelscope/models/multi_modal/ofa/utils/__init__.py index 76b03eeb..b937315b 100644 --- a/modelscope/models/multi_modal/ofa/utils/__init__.py +++ b/modelscope/models/multi_modal/ofa/utils/__init__.py @@ -1,2 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from .constant import OFA_TASK_KEY_MAPPING From ca72f5329c921a179c7c7b624e497adbee66c4bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Tue, 18 Oct 2022 16:39:52 +0800 Subject: [PATCH 15/54] commit code --- modelscope/metrics/accuracy_metric.py | 2 ++ modelscope/models/multi_modal/ofa/adaptor/__init__.py | 0 modelscope/models/multi_modal/ofa/modeling_ofa.py | 1 + modelscope/trainers/multi_modal/ofa/__init__.py | 2 ++ modelscope/trainers/multi_modal/ofa/ofa_trainer.py | 2 ++ 5 files changed, 7 insertions(+) create mode 100644 modelscope/models/multi_modal/ofa/adaptor/__init__.py diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index aab9a138..1761786e 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Dict import numpy as np diff --git a/modelscope/models/multi_modal/ofa/adaptor/__init__.py b/modelscope/models/multi_modal/ofa/adaptor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py index bc749b46..0a7a2ce6 100755 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -54,6 +54,7 @@ OFA_PRETRAINED_MODEL_ARCHIVE_LIST = [ 'ofa-medium', 'ofa-base', 'ofa-large', + 'ofa-huge', ] try: diff --git a/modelscope/trainers/multi_modal/ofa/__init__.py b/modelscope/trainers/multi_modal/ofa/__init__.py index 7222c48c..34e4ec7a 100644 --- a/modelscope/trainers/multi_modal/ofa/__init__.py +++ b/modelscope/trainers/multi_modal/ofa/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .ofa_trainer import OFATrainer diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 5c65a129..3daadf43 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+
 import math
 import os
 from functools import partial

From e76f5a96a3e0a5130ed00b30d827bb92f325a35f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=A1=8C=E5=97=94?=
Date: Wed, 19 Oct 2022 17:34:59 +0800
Subject: [PATCH 16/54] fix comments

---
 modelscope/metainfo.py                     |  2 +-
 modelscope/preprocessors/multi_modal.py    |  2 ++
 modelscope/utils/multi_modal/forked_pdb.py | 17 -----------------
 tests/pipelines/test_ofa_tasks.py          |  5 +----
 tests/trainers/test_ofa_trainer.py         | 19 ++++++++++++++-----
 5 files changed, 18 insertions(+), 27 deletions(-)
 delete mode 100644 modelscope/utils/multi_modal/forked_pdb.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 0b4291f0..c3fe5594 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -278,7 +278,7 @@ class Trainers(object):
     # multi-modal trainers
     clip_multi_modal_embedding = 'clip-multi-modal-embedding'
-    ofa_tasks = 'ofa-tasks-trainer'
+    ofa_tasks = 'ofa'

     # cv trainers
     image_instance_segmentation = 'image-instance-segmentation'
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 6d06bbb9..73742c47 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -83,6 +83,8 @@ class OfaPreprocessor(Preprocessor):
         return data

     def _compatible_with_pretrain(self, data):
+        # Images used in pretraining are all converted through PIL, and PIL save usually applies lossy compression.
+        # This logic is added to keep preprocessing consistent with pretraining.
         if 'image' in data and self.cfg.model.get('type', None) == 'ofa':
             if isinstance(data['image'], str):
                 image = load_image(data['image'])
diff --git a/modelscope/utils/multi_modal/forked_pdb.py b/modelscope/utils/multi_modal/forked_pdb.py
deleted file mode 100644
index 56107d1f..00000000
--- a/modelscope/utils/multi_modal/forked_pdb.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import pdb
-import sys
-
-
-class ForkedPdb(pdb.Pdb):
-    """A Pdb subclass that may be used
-    from a forked multiprocessing child
-
-    """
-
-    def interaction(self, *args, **kwargs):
-        _stdin = sys.stdin
-        try:
-            sys.stdin = open('/dev/stdin')
-            pdb.Pdb.interaction(self, *args, **kwargs)
-        finally:
-            sys.stdin = _stdin
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index 104c2869..f8366508 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -91,11 +91,8 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):

     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_text_classification_with_model(self):
-        # model = Model.from_pretrained(
-        #     'damo/ofa_text-classification_mnli_large_en')
         model = Model.from_pretrained(
-            '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_text-classification_mnli_large_en'
-        )
+            'damo/ofa_text-classification_mnli_large_en')
         ofa_pipe = pipeline(Tasks.text_classification, model=model)
         text = 'One of our number will carry out your instructions minutely.'
         text2 = 'A member of my team will execute your orders with immense precision.'
diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py
index c0704061..8aab3544 100644
--- a/tests/trainers/test_ofa_trainer.py
+++ b/tests/trainers/test_ofa_trainer.py
@@ -1,9 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob import os +import os.path as osp import shutil import unittest -from modelscope.trainers.multi_modal.ofa import OFATrainer +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer from modelscope.utils.test_utils import test_level @@ -11,10 +14,16 @@ class TestOfaTrainer(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer(self): - model_id = 'damo/ofa_image-caption_coco_huge_en' - self.trainer = OFATrainer(model_id) - os.makedirs(self.trainer.work_dir, exist_ok=True) - self.trainer.train() + os.environ['LOCAL_RANK'] = '0' + model_id = 'damo/ofa_text-classification_mnli_large_en' + default_args = {'model': model_id} + trainer = build_trainer( + name=Trainers.ofa_tasks, default_args=default_args) + os.makedirs(trainer.work_dir, exist_ok=True) + trainer.train() + assert len( + glob.glob(osp.join(trainer.work_dir, + 'best_epoch*_accuracy*.pth'))) == 2 if os.path.exists(self.trainer.work_dir): shutil.rmtree(self.trainer.work_dir) From 63a62c315121b07296a807e1266ef55388b11180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Wed, 19 Oct 2022 17:36:01 +0800 Subject: [PATCH 17/54] fix comments --- data/test/text/mnli/train.tsv | 101 ---------------------------------- data/test/text/mnli/valid.tsv | 11 ---- 2 files changed, 112 deletions(-) delete mode 100644 data/test/text/mnli/train.tsv delete mode 100644 data/test/text/mnli/valid.tsv diff --git a/data/test/text/mnli/train.tsv b/data/test/text/mnli/train.tsv deleted file mode 100644 index 83746457..00000000 --- a/data/test/text/mnli/train.tsv +++ /dev/null @@ -1,101 +0,0 @@ -sentence1 sentence2 label sentence1_genre -Alarm bells would not start ringing until these efforts-which could take five minutes or more-were tried and had failed. Alarm bells would not start until efforts had failed. 1 nineeleven:Alarm bells would not start ringing until these efforts-which could take five minutes or more-were tried and had failed. -In those countries where dialect study is undertaken, dialectologists observe that there are today many factors militating against the strict maintenance of older dialect the standardization of terminology as adopted by national periodicals, news services, radio, and television; the establishment of prestige dialects and, through the media, their promulgation; and the huge population shifts that have taken place, particularly in the U.S. since WWII. Outside of the U.S., this phenomenon is most prominently seen in the other countries involved in WWII. 0 verbatim:In those countries where dialect study is undertaken, dialectologists observe that there are today many factors militating against the strict maintenance of older dialect the standardization of terminology as adopted by national periodicals, news services, radio, and television; the establishment of prestige dialects and, through the media, their promulgation; and the huge population shifts that have taken place, particularly in the U.S. since WWII. -In the hands of parents and teachers lies the awesome responsibility of conveying to the next generation the intellectual, scientific, aesthetic, and moral achievements that dierentiate our species from others. Parents have the responsibility to convey to the next generation the scientific achievements humans have made. 
1 oup:In the hands of parents and teachers lies the awesome responsibility of conveying to the next generation the intellectual, scientific, aesthetic, and moral achievements that dierentiate our species from others. -By 9:20, Indianapolis Center learned that there were other hijacked aircraft, and began to doubt its initial assumption that American 77 had crashed. American 77 was confirmed to have crashed in an unrelated incident. 2 nineeleven:By 9:20, Indianapolis Center learned that there were other hijacked aircraft, and began to doubt its initial assumption that American 77 had crashed. -How about making their publicity buyer-friendlier as well? We need to have less of an input into publicity from buyers. 2 verbatim:How about making their publicity buyer-friendlier as well? -He liked to do little league and soccer with him, and we did all the things families do. He liked to engage in typical family activities with him, like soccer and little league. 1 letters:He liked to do little league and soccer with him, and we did all the things families do. -Business units adopting both bar codes and EDI are therefore able to reduce the transaction costs for processing information about sales and orders. Business units use either bar codes or EDI. 0 oup:Business units adopting both bar codes and EDI are therefore able to reduce the transaction costs for processing information about sales and orders. -When bar codes and EDI are combined with advanced shipping practices, the benefit of each practice is enhanced; order processing occurs more rapidly, accurately, and with less paper. Bar codes and EDI are synergistic with advanced shipping practice. 1 oup:When bar codes and EDI are combined with advanced shipping practices, the benefit of each practice is enhanced; order processing occurs more rapidly, accurately, and with less paper. -In all the following cases, the spelling, (apparent) roots, or sound of the word actively suggest a meaning different from the true one. The real meaning of the word is separate to its roots, spelling, and sound. 1 verbatim:In all the following cases, the spelling, (apparent) roots, or sound of the word actively suggest a meaning different from the true one. -In 2001, with Bin Ladin's help they re-formed into an organization called Ansar al Islam. Bin Ladin helped reform a group called Ansar al Islam. 1 nineeleven:In 2001, with Bin Ladin's help they re-formed into an organization called Ansar al Islam. -We are pleased to tell you of a very exciting development with the fund, which has reached a market value of $750,000. The fund has a market value of $750,000 because we invested heavily in a ponzi scheme. 0 letters:We are pleased to tell you of a very exciting development with the fund, which has reached a market value of $750,000. -Men say that, too, of course. Women are the only ones who say that. 2 verbatim:Men say that, too, of course. -The jagged heavy line in Figure 6.5 (page 100) depicts a typical inventory pattern for a replenishable product like our blue jeans in size 8. Note that the inventory level drops gradually as consumers purchase the item. The clean straight line in Figure 6.5 illustrates the inventory pattern for replenishible products. 2 oup:The jagged heavy line in Figure 6.5 (page 100) depicts a typical inventory pattern for a replenishable product like our blue jeans in size 8. Note that the inventory level drops gradually as consumers purchase the item. -Our 90th Birthday celebration began in July and will continue through February. 
The celebration will include a promotion for sales lasting for the duration of the celebration. 0 letters:Our 90th Birthday celebration began in July and will continue through February. -And, you know, with this, you know, it wasn't many opportunities for kids to be special, because kids weren't, you know, you were pushed out of adult conversation, and just really pushed to the side. Kids were so very special, even being included in adult conversations and given multiple opportunities. 2 facetoface:And, you know, with this, you know, it wasn't many opportunities for kids to be special, because kids weren't, you know, you were pushed out of adult conversation, and just really pushed to the side. -As a participant in the Chancellor's Circle or Chancellor's Associates, you will receive reports from Jerry Bepko on how he puts your gifts to work. You will receive reports from Jerry as frequently as you request. 0 letters:As a participant in the Chancellor's Circle or Chancellor's Associates, you will receive reports from Jerry Bepko on how he puts your gifts to work. -Um, Christmas is coming up pretty soon huh? It's soon going to be our Christmas party. 0 facetoface:Um, Christmas is coming up pretty soon huh? --The new Masters in Planning degree; The Masters of Planning degree has been around for a very long time. 2 letters:-The new Masters in Planning degree; -She responded by throwing down the block and turning to another activity. She responded by abandoning the block, and engaging in another activity. 1 oup:She responded by throwing down the block and turning to another activity. -Appreciate it. I'm forever grateful. 0 nineeleven:Appreciate it. -This book is a good introduction to the subject (in England); those familiar with dialectology in America, and those interested in the study in England or, indeed, generally would be well advised to add Word Maps to their libraries. The book describes differences between American English and British English. 0 verbatim:This book is a good introduction to the subject (in England); those familiar with dialectology in America, and those interested in the study in England or, indeed, generally would be well advised to add Word Maps to their libraries. -One gets the impression that the editors of L used the good stuff from the W and substituted their own, much better material when they encountered some of the bad stuff. The movie mashup editors were surprised how well the lifted L material meshed with their contributions. 0 verbatim:One gets the impression that the editors of L used the good stuff from the W and substituted their own, much better material when they encountered some of the bad stuff. -I hope you will take this opportunity to make a contribution to support SEND's homeownership work. I hope you'll make a contribution to support the work SEND does. 1 letters:I hope you will take this opportunity to make a contribution to support SEND's homeownership work. -By the 1990s, high birthrates and declining rates of infant mortality had produced a common problem throughout the Muslim a large, steadily increasing population of young men without any reasonable expectation of suitable or steady employment-a sure prescription for social turbulence. The Muslims have a high number of births. 
1 nineeleven:By the 1990s, high birthrates and declining rates of infant mortality had produced a common problem throughout the Muslim a large, steadily increasing population of young men without any reasonable expectation of suitable or steady employment-a sure prescription for social turbulence. -FAA headquarters had by this time established an open line of communication with the Command Center at Herndon and instructed it to poll all its centers about suspect aircraft. FAA headquarters refused to communicate with the Command Center at Herndon. 2 nineeleven:FAA headquarters had by this time established an open line of communication with the Command Center at Herndon and instructed it to poll all its centers about suspect aircraft. -Hani Hanjour, assigned to seat 1B (first class), soon followed. Hani Hanji was assigned to seat 1b most of the year. 0 nineeleven:Hani Hanjour, assigned to seat 1B (first class), soon followed. -But of what use is a long entry on spoonerisms? What what can this long entry do for us other than make us tired? 0 verbatim:But of what use is a long entry on spoonerisms? -What we are able to accomplish each year is a direct result of your generosity and your understanding of what it takes to provide the best legal education we possibly can. Your understanding has an effect on what we can accomplish. 1 letters:What we are able to accomplish each year is a direct result of your generosity and your understanding of what it takes to provide the best legal education we possibly can. -I want to know much of you. I don't have much time so we have to talk about you now or never. 0 verbatim:I want to know much of you. -I am pleased to tell you that we have had a positive response to the letter. We have had a positive response to the letter because we include drugs in the envelope. 0 letters:I am pleased to tell you that we have had a positive response to the letter. -At eight or ten stitches an inch, it is possible to seam thirteen to sixteen or more inches a second. Seaming between 13 and 15 inches per second is the ideal speed. 0 oup:At eight or ten stitches an inch, it is possible to seam thirteen to sixteen or more inches a second. -An English authority on dictionaries, James Root Hulbert, says that The Concise Oxford is the best for literary use in Britain and Chambers the best for general British use. The consise Oxford dictionary is the best one in all circumstances. 2 verbatim:An English authority on dictionaries, James Root Hulbert, says that The Concise Oxford is the best for literary use in Britain and Chambers the best for general British use. -At 8:51, the controller noticed the transponder change from United 175 and tried to contact the aircraft. The transponder code on United 175 changed and the controller tried contacting them. 1 nineeleven:At 8:51, the controller noticed the transponder change from United 175 and tried to contact the aircraft. -Captain Victor Saracini and First Officer Michael Horrocks piloted the Boeing 767, which had seven flight attendants. There were seven flight attendants aboard the Boeing 767. 1 nineeleven:Captain Victor Saracini and First Officer Michael Horrocks piloted the Boeing 767, which had seven flight attendants. -Fulfillment of this goal requires full participation from members of the Indiana Dental Association. In order to reach our goal we need full participation from members of the dental association. 1 letters:Fulfillment of this goal requires full participation from members of the Indiana Dental Association. 
-We put the baby mallard in a small aviary with the half-grown muscovy, and it worked. The mallard and the muscovy shared the aviary. 1 letters:We put the baby mallard in a small aviary with the half-grown muscovy, and it worked. -The President said he remembered such a conversation, and that it reminded him of when he had been an interceptor pilot. The President said nothing about the conversation in question. 2 nineeleven:The President said he remembered such a conversation, and that it reminded him of when he had been an interceptor pilot. -The information-integrated channels developed in the United States, which are now influencing sourcing patterns from Mexico and the Caribbean Basin, have begun to affect the textile and apparel sectors worldwide. Information-integrated channels have also been adopted in Europe more recently. 0 oup:The information-integrated channels developed in the United States, which are now influencing sourcing patterns from Mexico and the Caribbean Basin, have begun to affect the textile and apparel sectors worldwide. -The average tuition for a one-day C.E. course is about $125. The average tuition for a one-day C.E. course is over $100, but for an extra $50 you get the textbook included. 0 letters:The average tuition for a one-day C.E. course is about $125. -However, these are difficult times for public institutions of higher education, because legislative appropriations are either flat or in the decline. At the moment higher education institutions are thriving 2 letters:However, these are difficult times for public institutions of higher education, because legislative appropriations are either flat or in the decline. -For example, James Garner's Rockford dubbed as a Japanese tenor is a reminder of one's firm awareness of Garner's American tone and timbre. James Garner's Rockford dubbed as a Spanish tenor is quite impressive. 2 verbatim:For example, James Garner's Rockford dubbed as a Japanese tenor is a reminder of one's firm awareness of Garner's American tone and timbre. -He worked, he's a teacher, and at that time he worked as the principal of that school, of that school, because it was a, like a high school, there was, from first (grade) to high school. The man is a stripper, and a damn good one at that. 2 facetoface:He worked, he's a teacher, and at that time he worked as the principal of that school, of that school, because it was a, like a high school, there was, from first (grade) to high school. -Uh, my mom took me for a it, um, doctor's visit uh, it was a physical. My mom took me to the doctors for a physical. 1 facetoface:Uh, my mom took me for a it, um, doctor's visit uh, it was a physical. -The forecasting and inventory models presented in this chapter are not new; they have been recommended for years by statisticians and operations researchers. The inventory operations presented in this chapter all take a lot of time to implement. 0 oup:The forecasting and inventory models presented in this chapter are not new; they have been recommended for years by statisticians and operations researchers. -Gifts of $40.00 add up to provide valuable funding. Valuable funding can be made up of gifts of $40.00. 1 letters:Gifts of $40.00 add up to provide valuable funding. -The mission of the Social Health Association of Central Indiana is to promote healthy behavior and responsible relationships through sexuality education and life skills training. 
Social Health Association of Central Indiana wants to promote healthy behaviors through sex ed and life skills training. 1 letters:The mission of the Social Health Association of Central Indiana is to promote healthy behavior and responsible relationships through sexuality education and life skills training. -To begin with, the adoption of bar codes came before rapid replenishment arrangements because retailers required a low-cost means of collecting information at the detailed product level for their own use'that is, they first developed an efficient method for scanning prices at the check-out register and tracking products for internal inventory purposes. There are several cheap methods for retailer information collection, but bar codes are the best. 0 oup:To begin with, the adoption of bar codes came before rapid replenishment arrangements because retailers required a low-cost means of collecting information at the detailed product level for their own use'that is, they first developed an efficient method for scanning prices at the check-out register and tracking products for internal inventory purposes. -From that point of view the differing interpretations Mr. Anson and I read into the passage are of secondary importance. Mr. Anson was an expert at political interpretations. 0 verbatim:From that point of view the differing interpretations Mr. Anson and I read into the passage are of secondary importance. -But we know that at 10:31, General Larry Arnold instructed his staff to broadcast the following over a NORAD instant messaging 10:31 Vice president has cleared to us to intercept tracks of interest and shoot them down if they do not respond per [General Arnold]. General Larry Arnold told his staff to broadcast over a NORAD messaging service at 10:31 that the Vice president had authorized the shooting down of hijacked planes, they did so immediately. 0 nineeleven:But we know that at 10:31, General Larry Arnold instructed his staff to broadcast the following over a NORAD instant messaging 10:31 Vice president has cleared to us to intercept tracks of interest and shoot them down if they do not respond per [General Arnold]. -We are leaders in the bar, business, government, and community affairs. We are the dregs of the community affairs and we know it. 2 letters:We are leaders in the bar, business, government, and community affairs. -He died in a ferryboat accident on Lake Victoria just a few days after Bin Ladin arrived in Jalalabad, leaving Bin Ladin with a need to replace him not only in the Shura but also as supervisor of the cells and prospective operations in East Africa. After his untimely death, Bin Ladin was forced to replace his roles in the Shura and in supervising cells in East Africa. 1 nineeleven:He died in a ferryboat accident on Lake Victoria just a few days after Bin Ladin arrived in Jalalabad, leaving Bin Ladin with a need to replace him not only in the Shura but also as supervisor of the cells and prospective operations in East Africa. -Letters in support or condemnation of the QES program (though one may assume they will insist on programme ) should be addressed to Mrs Anne Shelley, Secretary, Queen's English Society, 3 Manor Crescent, Guildford GU2 6NF, England. Mrs. Anne Shelley is in charge of the QES program. 2 verbatim:Letters in support or condemnation of the QES program (though one may assume they will insist on programme ) should be addressed to Mrs Anne Shelley, Secretary, Queen's English Society, 3 Manor Crescent, Guildford GU2 6NF, England. 
-This was done because of the organization of work in clothing shops; the low capital costs and high proportion of labor costs, especially in women's wear for contract shops; the intense product competition among manufacturers within and among geographic markets; and the diversity of products and changing styles. This was done because of how workers in clothing shops were organized according to experience. 0 oup:This was done because of the organization of work in clothing shops; the low capital costs and high proportion of labor costs, especially in women's wear for contract shops; the intense product competition among manufacturers within and among geographic markets; and the diversity of products and changing styles. -Cancel, and tear to pieces, that great bond Which keeps me pale! Remove and destroy the thing that keeps me pale! 1 verbatim:Cancel, and tear to pieces, that great bond Which keeps me pale! -Between 8:25 and 8:32, in accordance with the FAA protocol, Boston Center managers started notifying their chain of command that American 11 had been hijacked. it was not until 9:00 that Boston Center messengers realized that American 11 had been hijacked. 2 nineeleven:Between 8:25 and 8:32, in accordance with the FAA protocol, Boston Center managers started notifying their chain of command that American 11 had been hijacked. -I love it! I hate it. 2 facetoface:I love it! -Instead, in a number of cases their rulers sought to buy off local Islamist movements by ceding control of many social and educational issues. This is why so much violence has been directed away from their native countries. 0 nineeleven:Instead, in a number of cases their rulers sought to buy off local Islamist movements by ceding control of many social and educational issues. -The time saved in production can be lost if the distribution method is slow, or if there are other impediments to the movement of products from the apparel-maker to the retailer. The shortened production time would be wasted if the distribution is slow. 1 oup:The time saved in production can be lost if the distribution method is slow, or if there are other impediments to the movement of products from the apparel-maker to the retailer. -Periodically, the Islamic world has seen surges of what, for want of a better term, is often labeled fundamentalism. Fundamentalism periodically surfaces in Islamic countries. 1 nineeleven:Periodically, the Islamic world has seen surges of what, for want of a better term, is often labeled fundamentalism. -He told us that by the time he arrived, the order had already been passed down NORAD's chain of command. He told us that the order had been sent down from the FAA. 2 nineeleven:He told us that by the time he arrived, the order had already been passed down NORAD's chain of command. -But uh, we hear a lot about home and how it used to be and like I said walking five miles to school and-- We like to know what the old days are like. 0 facetoface:But uh, we hear a lot about home and how it used to be and like I said walking five miles to school and-- -noisome Has nothing to do with sound or decibel level, but means simply unpleasant or disgusting. Noisome had something to do with the lights, however. 0 verbatim:noisome Has nothing to do with sound or decibel level, but means simply unpleasant or disgusting. -However, it didn't seem to be so horrifying. It did not seem to be so scary. 1 facetoface:However, it didn't seem to be so horrifying. - Hold on a second. Wait a second. 1 nineeleven: Hold on a second. 
- What did it look like? What did it look like to you? 1 oup: What did it look like? -Mass customization of this sort also means that a single garment must pass through the sewing room at a time. The complex customization of the garment requires every worker's attention. 0 oup:Mass customization of this sort also means that a single garment must pass through the sewing room at a time. -Cobuild and CED are far from being such polar opposites, but they exemplify this general point. Cobuild and CED are polar opposites, no matter which way you look at it. 2 verbatim:Cobuild and CED are far from being such polar opposites, but they exemplify this general point. -The flight did not respond. The flight responded. 2 nineeleven:The flight did not respond. -Here we will show how a decision tool can be used to make the transition from general intuition to specific decisions about (1) which products to make in each plant and (2) how to schedule the time and quantity of production for each product. Here we are going to show how a decision tool can be useful in making the transition from intuition to specific decision. 1 oup:Here we will show how a decision tool can be used to make the transition from general intuition to specific decisions about (1) which products to make in each plant and (2) how to schedule the time and quantity of production for each product. -The air defense of America began with this call. America's air defense had already begun before the call was made. 2 nineeleven:The air defense of America began with this call. -[I]mmigrants are cheap and controllable. German immigrants are easy to afford and control. 0 oup:[I]mmigrants are cheap and controllable. -(It should be noted that Johnson made the same kind of adaptation of another poem by Juvenal, Satire X, calling it The Vanity of Human Wishes. Johnson made no adaptations to poems by Juvenal. 2 verbatim:(It should be noted that Johnson made the same kind of adaptation of another poem by Juvenal, Satire X, calling it The Vanity of Human Wishes. -Callers reported that a passenger had been stabbed and that two people were lying on the floor of the cabin, injured or dead-possibly the captain and first officer. No one called from the airplane at all. 2 nineeleven:Callers reported that a passenger had been stabbed and that two people were lying on the floor of the cabin, injured or dead-possibly the captain and first officer. -Many Americans have wondered, Why do 'they' hate us? Americans wonder why they hate them. 0 nineeleven:Many Americans have wondered, Why do 'they' hate us? -It is my (wholly unsubstantiated) guess that the majority of Soviet personnel in Vietnam are in fact not Russian. It is likely that most of them are from other places 0 verbatim:It is my (wholly unsubstantiated) guess that the majority of Soviet personnel in Vietnam are in fact not Russian. -We thought it would be cool to see just how far a BB would shoot. We didn't have a BB gun. 2 facetoface:We thought it would be cool to see just how far a BB would shoot. -For Indianapolis, that public university must be IUPUI. The IUPUI university is the only public university in town. 0 letters:For Indianapolis, that public university must be IUPUI. -Hopefully, all of us can do more internal marketing with our young patients to encourage them to consider the field of Dental Assisting. No one should be encouraged to consider being a dental assistant. 
2 letters:Hopefully, all of us can do more internal marketing with our young patients to encourage them to consider the field of Dental Assisting. -NEADS decided to keep the Otis fighters over New York. The fighters were on guard to destroy any airplane. 0 nineeleven:NEADS decided to keep the Otis fighters over New York. -He trained in desktop publishing and combined his enthusiastic work ethic with new-found skills in a burgeoning industry. This person learned about publishing. 1 letters:He trained in desktop publishing and combined his enthusiastic work ethic with new-found skills in a burgeoning industry. -Who would have been telling you those stories? Who did you tell those stories to? 2 facetoface:Who would have been telling you those stories? -So, I have my sister's kid here and I'm going to kill him underneath this vehicle shortly. My sister does not have a child. 2 facetoface:So, I have my sister's kid here and I'm going to kill him underneath this vehicle shortly. -According to one report, Saddam Hussein's efforts at this time to rebuild relations with the Saudis and other Middle Eastern regimes led him to stay clear of Bin Ladin. It was because of his time spent making up for past actions that Saddam Hussein did not come in contact with Bin Laden. 0 nineeleven:According to one report, Saddam Hussein's efforts at this time to rebuild relations with the Saudis and other Middle Eastern regimes led him to stay clear of Bin Ladin. -He motioned to me as if they were going to cut off his head. He made a rude gesture at me with one of his fingers. 2 facetoface:He motioned to me as if they were going to cut off his head. -It is not at once apparent why the book is styled an almanac, but that is there is no other book I know of that contains as much diverse information about American writers as this one. The book is extremely thorough and well written. 1 verbatim:It is not at once apparent why the book is styled an almanac, but that is there is no other book I know of that contains as much diverse information about American writers as this one. -Here the answer is a definite yes. The answer is yes. 1 oup:Here the answer is a definite yes. -Today, Bodenheim's novel might be of interest to students of the English language because of its use of slang. Bodenheim's novel might be of interest to students of French Cuisine because of its use of recipes. 2 verbatim:Today, Bodenheim's novel might be of interest to students of the English language because of its use of slang. -Each week's demand has been divided by the average demand over the twenty-four weeks; therefore, the average weekly demand is simply equal to 1.0 on this normalized scale. Weekly demand is divided by the twenty four week average. 1 oup:Each week's demand has been divided by the average demand over the twenty-four weeks; therefore, the average weekly demand is simply equal to 1.0 on this normalized scale. -Even though I had freedom when I was, you know, home, whatever, but I still had a curfew. I had freedom when I was home, and there was no curfew, yahoo! 2 facetoface:Even though I had freedom when I was, you know, home, whatever, but I still had a curfew. -Thus, Step down (or back) and give me a shot was readily understood. Therefore, statements referring to giving me a shot were comprehended. 1 verbatim:Thus, Step down (or back) and give me a shot was readily understood. -For Indianapolis, that public university must be IUPUI. IUPUI is in the city of Chicago. 2 letters:For Indianapolis, that public university must be IUPUI. 
-I'm sending this follow-up letter to let you know that your support is greatly needed and appreciated by everyone involved with graduate Endodontics at IU. I have sent you 50 letters before this follow-up letter because you refuse to answer any other letters. 0 letters:I'm sending this follow-up letter to let you know that your support is greatly needed and appreciated by everyone involved with graduate Endodontics at IU. -The Herron School of Art and Gallery of Indiana University is contemporary art! The gallery at the Herron school displays contemporary art. 1 letters:The Herron School of Art and Gallery of Indiana University is contemporary art! -So, I'm kind of like the hope, I guess. I suppose I'm the hope, or something. 1 facetoface:So, I'm kind of like the hope, I guess. -Please donate today. Our website is down please come back tomorrow to make a donation. 2 letters:Please donate today. -Do you watch that? Can you see? 2 facetoface:Do you watch that? -To a Western ear, the most predictable of language traits, perhaps, is the well-advertised Japanese use of r for our l . Indeed, in my travels about Honshu during a three-month visit, I did hear coinrocker, see you rater, Adurt Graphics (dirty books), blackwrrants (hit-and-miss rendering of black walnuts), Coffee Corombia (a chain of coffee shops), and Coconut Glove. To the Western ear, the least predictable of language traits are perhaps the most well-advertised use of r. 2 verbatim:To a Western ear, the most predictable of language traits, perhaps, is the well-advertised Japanese use of r for our l . Indeed, in my travels about Honshu during a three-month visit, I did hear coinrocker, see you rater, Adurt Graphics (dirty books), blackwrrants (hit-and-miss rendering of black walnuts), Coffee Corombia (a chain of coffee shops), and Coconut Glove. -The recorder captured the sounds of loud thumps, crashes, shouts, and breaking glasses and plates. The recorder didn't capture any of the sounds. 2 nineeleven:The recorder captured the sounds of loud thumps, crashes, shouts, and breaking glasses and plates. -That's a good attitude! You feel good about this, don't you? 0 facetoface:That's a good attitude! -Bloomer (for `flower'), butter (for `ram'), or even flower (for `river') are recurrent examples, but solvers must always be on the alert for new traps of this Bloomer is another word for flower, butter is for ram and flower for river. 1 verbatim:Bloomer (for `flower'), butter (for `ram'), or even flower (for `river') are recurrent examples, but solvers must always be on the alert for new traps of this diff --git a/data/test/text/mnli/valid.tsv b/data/test/text/mnli/valid.tsv deleted file mode 100644 index dd720865..00000000 --- a/data/test/text/mnli/valid.tsv +++ /dev/null @@ -1,11 +0,0 @@ -sentence1 sentence2 label sentence1_genre -The new rights are nice enough Everyone really likes the newest benefits 0 slate:The new rights are nice enough -This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. 2 government:This site includes a list of all award winners and a searchable database of Government Executive articles. -uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. 
1 telephone:uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him -yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. 2 telephone:yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food -i don't know um do you do a lot of camping I know exactly. 2 telephone:i don't know um do you do a lot of camping -well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be We have plenty of space in the landfill. 2 telephone:well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be -yeah i know and i did that all through college and it worked too I did that all through college but it never worked 2 telephone:yeah i know and i did that all through college and it worked too -Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. Most of Mrinal Sen's work can be found in European collections. 0 travel:Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. -If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. THe strike price could be $8. 2 slate:If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. -3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? Would you rise up and defeaat all evil lords in the town? 0 slate:3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? 
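The local MNLI tsv fixtures deleted above are replaced in the following patches by loading the same splits from the hub. A minimal sketch of that pattern, assuming the MsDataset API exactly as it is used in the updated tests/trainers/test_ofa_trainer.py later in this series (the dataset, subset, namespace, split and column names are taken from that test; the tiny slice sizes are illustrative only):

# Sketch: load a small MNLI slice from the hub and rename its columns to the
# keys expected by OfaPreprocessor ('text'/'text2'), mirroring the test setUp().
from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import DownloadMode

column_map = {'premise': 'text', 'hypothesis': 'text2'}
data_train = MsDataset.load(
    dataset_name='glue',
    subset_name='mnli',
    namespace='modelscope',
    split='train[:100]',  # illustrative slice size
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
train_dataset = MsDataset.from_hf_dataset(
    data_train._hf_ds.rename_columns(column_map))

The resulting dataset (and an analogously built validation split) can then be handed to the refactored OFATrainer below as train_dataset/eval_dataset.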
From 9b8cfc4ecefb96696ca673e0775dbc46930ae84e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Thu, 20 Oct 2022 22:32:41 +0800 Subject: [PATCH 18/54] modify ofatrainer --- .../trainers/multi_modal/ofa/ofa_trainer.py | 15 ++++---- tests/trainers/test_ofa_trainer.py | 35 +++++++++++++++++-- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 3daadf43..474a6772 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -24,12 +24,13 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, @TRAINERS.register_module(module_name=Trainers.ofa_tasks) class OFATrainer(EpochBasedTrainer): - def __init__(self, model: str, *args, **kwargs): + def __init__(self, model: str, cfg_file, work_dir, train_dataset, + eval_dataset, *args, **kwargs): model = Model.from_pretrained(model) model_dir = model.model_dir - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + # cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) - dataset = self._build_dataset_with_config(cfg) + # dataset = self._build_dataset_with_config(cfg) preprocessor = { ConfigKeys.train: OfaPreprocessor( @@ -41,7 +42,7 @@ class OFATrainer(EpochBasedTrainer): # use torchrun launch world_size = int(os.environ.get('WORLD_SIZE', 1)) epoch_steps = math.ceil( - len(dataset['train']) / # noqa + len(train_dataset) / # noqa (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs cfg.train.criterion.tokenizer = model.tokenizer @@ -68,11 +69,11 @@ class OFATrainer(EpochBasedTrainer): cfg_file=cfg_file, model=model, data_collator=collator, - train_dataset=dataset['train'], - eval_dataset=dataset['valid'], + train_dataset=train_dataset, + eval_dataset=eval_dataset, preprocessor=preprocessor, optimizers=(optimizer, lr_scheduler), - work_dir=cfg.train.work_dir, + work_dir=work_dir, *args, **kwargs, ) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 8aab3544..3322271d 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -3,22 +3,51 @@ import glob import os import os.path as osp import shutil +import tempfile import unittest from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode from modelscope.utils.test_utils import test_level class TestOfaTrainer(unittest.TestCase): + def setUp(self): + column_map = {'premise': 'text', 'hypothesis': 'text2'} + data_train = MsDataset.load( + dataset_name='glue', + subset_name='mnli', + namespace='modelscope', + split='train[:100]', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) + self.train_dataset = MsDataset.from_hf_dataset( + data_train._hf_ds.rename_columns(column_map)) + data_eval = MsDataset.load( + dataset_name='glue', + subset_name='mnli', + namespace='modelscope', + split='validation_matched[:8]', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) + self.test_dataset = MsDataset.from_hf_dataset( + data_eval._hf_ds.rename_columns(column_map)) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer(self): os.environ['LOCAL_RANK'] = '0' model_id = 'damo/ofa_text-classification_mnli_large_en' - default_args = 
{'model': model_id} - trainer = build_trainer( - name=Trainers.ofa_tasks, default_args=default_args) + + kwargs = dict( + model=model_id, + cfg_file= + '/Users/running_you/.cache/modelscope/hub/damo/ofa_text-classification_mnli_large_en//configuration.json', + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + work_dir='/Users/running_you/.cache/modelscope/hub/work/mnli') + + trainer = build_trainer(name=Trainers.ofa_tasks, default_args=kwargs) os.makedirs(trainer.work_dir, exist_ok=True) trainer.train() assert len( From 55fb3b05a91107dd083d9684ee22906d406338e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Sun, 23 Oct 2022 21:29:17 +0800 Subject: [PATCH 19/54] format finetune code, and ut case --- modelscope/metainfo.py | 4 +- modelscope/metrics/bleu_metric.py | 2 +- modelscope/metrics/builder.py | 1 + modelscope/preprocessors/multi_modal.py | 14 +- modelscope/preprocessors/ofa/base.py | 16 ++ .../preprocessors/ofa/image_captioning.py | 14 +- .../preprocessors/ofa/ocr_recognition.py | 4 +- .../preprocessors/ofa/utils/constant.py | 13 ++ .../trainers/multi_modal/ofa/ofa_trainer.py | 137 +++++++++++------- modelscope/utils/constant.py | 1 + tests/trainers/test_ofa_trainer.py | 103 +++++++++++-- 11 files changed, 215 insertions(+), 94 deletions(-) create mode 100644 modelscope/preprocessors/ofa/utils/constant.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ac3fb4e2..b559f5c0 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -377,7 +377,7 @@ class Metrics(object): audio_noise_metric = 'audio-noise-metric' # text gen - bleu = 'bleu' + BLEU = 'bleu' # metrics for image denoise task image_denoise_metric = 'image-denoise-metric' @@ -399,6 +399,8 @@ class Metrics(object): movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' # metric for inpainting task image_inpainting_metric = 'image-inpainting-metric' + # metric for ocr + NED = 'ned' class Optimizers(object): diff --git a/modelscope/metrics/bleu_metric.py b/modelscope/metrics/bleu_metric.py index 43d1b105..7c134b6a 100644 --- a/modelscope/metrics/bleu_metric.py +++ b/modelscope/metrics/bleu_metric.py @@ -11,7 +11,7 @@ from .builder import METRICS, MetricKeys EVAL_BLEU_ORDER = 4 -@METRICS.register_module(group_key=default_group, module_name=Metrics.bleu) +@METRICS.register_module(group_key=default_group, module_name=Metrics.BLEU) class BleuMetric(Metric): """The metric computation bleu for text generation classes. 
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 1c8e16d7..da3b64c7 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -23,6 +23,7 @@ class MetricKeys(object): BLEU_4 = 'bleu-4' ROUGE_1 = 'rouge-1' ROUGE_L = 'rouge-l' + NED = 'ned' # ocr metric task_default_metrics = { diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 2447c0b5..3c4ac58a 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -16,6 +16,7 @@ from .base import Preprocessor from .builder import PREPROCESSORS from .ofa import * # noqa from .ofa.utils.collate import collate_fn +from .ofa.utils.constant import OFA_TASK_KEY_MAPPING __all__ = [ 'OfaPreprocessor', @@ -51,24 +52,13 @@ class OfaPreprocessor(Preprocessor): Tasks.text_summarization: OfaSummarizationPreprocessor, Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor } - input_key_mapping = { - Tasks.ocr_recognition: ['image'], - Tasks.image_captioning: ['image'], - Tasks.image_classification: ['image'], - Tasks.text_summarization: ['text'], - Tasks.text_classification: ['text', 'text2'], - Tasks.visual_grounding: ['image', 'text'], - Tasks.visual_question_answering: ['image', 'text'], - Tasks.visual_entailment: ['image', 'text', 'text2'], - Tasks.text_to_image_synthesis: ['text'] - } model_dir = model_dir if osp.exists(model_dir) else snapshot_download( model_dir) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) self.preprocess = preprocess_mapping[self.cfg.task]( cfg=self.cfg, model_dir=model_dir, mode=mode) - self.keys = input_key_mapping[self.cfg.task] + self.keys = OFA_TASK_KEY_MAPPING[self.cfg.task] self.tokenizer = self.preprocess.tokenizer if kwargs.get('no_collate', None): self.no_collate = True diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 47d70f6d..55b3895d 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -6,9 +6,12 @@ from os import path as osp import json import numpy as np import torch +from PIL import Image from modelscope.models.multi_modal.ofa import OFATokenizer, OFATokenizerZH +from modelscope.preprocessors.image import load_image from modelscope.utils.trie import Trie +from .utils.constant import OFA_TASK_KEY_MAPPING from .utils.random_help import set_torch_seed @@ -59,6 +62,14 @@ class OfaBasePreprocessor: self.mean = [0.5, 0.5, 0.5] self.std = [0.5, 0.5, 0.5] self.patch_image_size = self.cfg.model.get('patch_image_size', 480) + self.column_map = { + key: key + for key in OFA_TASK_KEY_MAPPING[self.cfg.task] + } + if hasattr(self.cfg, + 'dataset') and self.cfg.dataset.column_map is not None: + for k, v in self.cfg.dataset.column_map.items(): + self.column_map[k] = v self.transtab = str.maketrans( {key: None for key in string.punctuation}) @@ -147,3 +158,8 @@ class OfaBasePreprocessor: constraint_prefix_token) constraint_mask[i][constraint_nodes] = True sample['constraint_mask'] = constraint_mask + + def get_img_pil(self, path_or_url_or_pil): + image = path_or_url_or_pil if isinstance(path_or_url_or_pil, Image.Image) \ + else load_image(path_or_url_or_pil) + return image diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 6c842aa9..99eda15d 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -1,12 +1,9 @@ # Copyright (c) Alibaba, 
Inc. and its affiliates. -import os -from typing import Any, Dict, Union +from typing import Any, Dict import torch -from PIL import Image from torchvision import transforms -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -46,7 +43,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data['text'] + target = data[self.column_map['text']] target = target.translate(self.transtab).strip() target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) @@ -56,8 +53,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') inputs = self.tokenize_text(prompt) @@ -66,6 +62,6 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } - if 'text' in data: - sample['label'] = data['text'] + if self.column_map['text'] in data: + sample['label'] = data[self.column_map['text']] return sample diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 1d30e572..4c8c245a 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -1,7 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import random -import unicodedata -from typing import Any, Dict, Union +from typing import Any, Dict import torch from PIL import Image diff --git a/modelscope/preprocessors/ofa/utils/constant.py b/modelscope/preprocessors/ofa/utils/constant.py new file mode 100644 index 00000000..102d27c0 --- /dev/null +++ b/modelscope/preprocessors/ofa/utils/constant.py @@ -0,0 +1,13 @@ +from modelscope.utils.constant import Tasks + +OFA_TASK_KEY_MAPPING = { + Tasks.ocr_recognition: ['image'], + Tasks.image_captioning: ['image'], + Tasks.image_classification: ['image'], + Tasks.text_summarization: ['text'], + Tasks.text_classification: ['text', 'text2'], + Tasks.visual_grounding: ['image', 'text'], + Tasks.visual_question_answering: ['image', 'text'], + Tasks.visual_entailment: ['image', 'text', 'text2'], + Tasks.text_to_image_synthesis: ['text'] +} diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 3daadf43..c287c182 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -2,21 +2,27 @@ import math import os +import shutil from functools import partial +from typing import Callable, Dict, Optional, Tuple, Union -from datasets import load_dataset +import torch from torch import distributed as dist +from torch import nn +from torch.utils.data import Dataset from modelscope.metainfo import Trainers -from modelscope.models.base import Model +from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.multi_modal import OfaPreprocessor from modelscope.preprocessors.ofa.utils.collate import collate_fn from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config -from modelscope.utils.constant import ConfigKeys, ModeKeys, ModelFile +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, + ModeKeys) from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, get_schedule) @@ -24,56 +30,100 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, @TRAINERS.register_module(module_name=Trainers.ofa_tasks) class OFATrainer(EpochBasedTrainer): - def __init__(self, model: str, *args, **kwargs): - model = Model.from_pretrained(model) + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, + **kwargs): + model = Model.from_pretrained(model, revision=model_revision) model_dir = model.model_dir - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) - dataset = self._build_dataset_with_config(cfg) - preprocessor = { - ConfigKeys.train: - OfaPreprocessor( - model_dir=model_dir, mode=ModeKeys.TRAIN, no_collate=True), - ConfigKeys.val: - OfaPreprocessor( - 
model_dir=model_dir, mode=ModeKeys.EVAL, no_collate=True), + if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: + work_dir = cfg.train.work_dir + else: + work_dir = kwargs['work_dir'] + tokenizer_files = { + 'zh': [ + 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt', + 'config.json' + ], + 'en': + ['tokenizer.json', 'vocab.json', 'merges.txt', 'config.json'], } + for filename in tokenizer_files[cfg.model.get('language', 'en')]: + finetune_file = os.path.join(work_dir, filename) + pretrain_file = os.path.join(model_dir, filename) + if os.path.exists(finetune_file): + continue + if os.path.exists(pretrain_file): + shutil.copy(pretrain_file, finetune_file) + + if preprocessor is None: + preprocessor = { + ConfigKeys.train: + OfaPreprocessor( + model_dir=work_dir, mode=ModeKeys.TRAIN, no_collate=True), + ConfigKeys.val: + OfaPreprocessor( + model_dir=work_dir, mode=ModeKeys.EVAL, no_collate=True), + } # use torchrun launch world_size = int(os.environ.get('WORLD_SIZE', 1)) epoch_steps = math.ceil( - len(dataset['train']) / # noqa + len(train_dataset) / # noqa (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs cfg.train.criterion.tokenizer = model.tokenizer self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( cfg.train.criterion) - optimizer = build_optimizer(model, cfg=cfg.train.optimizer) - scheduler_class, scheduler_args = get_schedule(cfg.train.lr_scheduler) - if scheduler_class is not None: - lr_scheduler = scheduler_class(**{'optimizer': optimizer}, - **scheduler_args) + if optimizers[0] is None: + optimizer = build_optimizer(model, cfg=cfg.train.optimizer) else: - lr_scheduler = None - collator = partial( - collate_fn, - pad_idx=model.tokenizer.pad_token_id, - eos_idx=model.tokenizer.eos_token_id, - ) + optimizer = optimizers[0] + if optimizers[1] is None: + scheduler_class, scheduler_args = get_schedule( + cfg.train.lr_scheduler) + if scheduler_class is not None: + lr_scheduler = scheduler_class(**{'optimizer': optimizer}, + **scheduler_args) + else: + lr_scheduler = None + else: + lr_scheduler = optimizers[1] + optimizers = (optimizer, lr_scheduler) + if data_collator is None: + data_collator = partial( + collate_fn, + pad_idx=model.tokenizer.pad_token_id, + eos_idx=model.tokenizer.eos_token_id, + ) if 'launcher' not in kwargs and cfg.train.get('launcher', None): kwargs['launcher'] = cfg.train.launcher if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): kwargs['use_fp16'] = cfg.train.use_fp16 kwargs['to_tensor'] = False super().__init__( - cfg_file=cfg_file, model=model, - data_collator=collator, - train_dataset=dataset['train'], - eval_dataset=dataset['valid'], + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, preprocessor=preprocessor, - optimizers=(optimizer, lr_scheduler), - work_dir=cfg.train.work_dir, - *args, + optimizers=optimizers, + seed=seed, **kwargs, ) @@ -102,24 +152,3 @@ class OFATrainer(EpochBasedTrainer): else: self.log_buffer.update(train_outputs['log_vars']) self.train_outputs = train_outputs - - def _build_dataset_with_config(self, cfg): - if hasattr(cfg.dataset, 'hf_dataset'): - dataset = load_dataset( - cfg.dataset.script, - data_files=cfg.dataset.hf_dataset, - sep=cfg.dataset.sep, - ) - dataset = MsDataset.from_hf_dataset( - dataset.rename_columns(cfg.dataset.column_map)) - return dataset - elif hasattr(cfg.dataset, 'ms_dataset'): - dataset_d = dict() - 
for key in cfg.dataset.ms_dataset.keys(): - dataset_d[key] = MsDataset.load(**cfg.dataset.ms_dataset[key]) - dataset_d[key] = MsDataset.from_hf_dataset( - dataset_d[key]._hf_ds.rename_columns( - cfg.dataset.column_map)) - return dataset_d - else: - raise NotImplementedError diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 87a0a417..a3f4a935 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -282,6 +282,7 @@ class ConfigKeys(object): """Fixed keywords in configuration file""" train = 'train' val = 'val' + test = 'test' class Requirements(object): diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 8aab3544..fe7672df 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -5,27 +5,102 @@ import os.path as osp import shutil import unittest -from modelscope.metainfo import Trainers +import json + +from modelscope.metainfo import Metrics, Trainers +from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import test_level class TestOfaTrainer(unittest.TestCase): - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_trainer(self): - os.environ['LOCAL_RANK'] = '0' - model_id = 'damo/ofa_text-classification_mnli_large_en' - default_args = {'model': model_id} - trainer = build_trainer( - name=Trainers.ofa_tasks, default_args=default_args) - os.makedirs(trainer.work_dir, exist_ok=True) + def setUp(self) -> None: + self.finetune_cfg = \ + {'framework': 'pytorch', + 'task': 'image-captioning', + 'model': {'type': 'ofa', + 'beam_search': {'beam_size': 5, + 'max_len_b': 16, + 'min_len': 1, + 'no_repeat_ngram_size': 0}, + 'seed': 7, + 'max_src_length': 256, + 'language': 'en', + 'gen_type': 'generation', + 'patch_image_size': 480, + 'max_image_size': 480, + 'imagenet_default_mean_and_std': False}, + 'pipeline': {'type': 'image-captioning'}, + 'dataset': {'column_map': {'text': 'caption'}}, + 'train': {'work_dir': 'work/ckpts/caption', + # 'launcher': 'pytorch', + 'max_epochs': 1, + 'use_fp16': True, + 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + 'lr_scheduler': {'name': 'polynomial_decay', + 'warmup_proportion': 0.01, + 'lr_end': 1e-07}, + 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, + 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, + 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', + 'cumulative_iters': 1, + 'grad_clip': {'max_norm': 1.0, 'norm_type': 2}, + 'loss_keys': 'loss'}, + 'criterion': {'name': 'AdjustLabelSmoothedCrossEntropyCriterion', + 'constraint_range': None, + 'drop_worst_after': 0, + 'drop_worst_ratio': 0.0, + 'ignore_eos': False, + 'ignore_prefix_size': 0, + 'label_smoothing': 0.0, + 'reg_alpha': 1.0, + 'report_accuracy': False, + 'sample_patch_num': 196, + 'sentence_avg': False, + 'use_rdrop': False}, + 'hooks': [{'type': 'BestCkptSaverHook', + 'metric_key': 'bleu-4', + 'interval': 100}, + {'type': 'TextLoggerHook', 'interval': 1}, + {'type': 'IterTimerHook'}, + {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, + 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + 'metrics': [{'type': 'bleu', + 'eval_tokenized_bleu': False, + 'ref_name': 'labels', + 'hyp_name': 'caption'}]}, + 'preprocessor': []} + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_std(self): 
+ WORKSPACE = './workspace/ckpts/caption' + os.makedirs(WORKSPACE, exist_ok=True) + config_file = os.path.join(WORKSPACE, 'configuration.json') + with open(config_file, 'w') as writer: + json.dump(self.finetune_cfg, writer) + + pretrained_model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_large_en' + args = dict( + model=pretrained_model, + work_dir=WORKSPACE, + train_dataset=MsDataset.load( + 'coco_2014_caption', + namespace='modelscope', + split='train[:100]'), + eval_dataset=MsDataset.load( + 'coco_2014_caption', + namespace='modelscope', + split='validation[:20]'), + metrics=[Metrics.BLEU], + cfg_file=config_file) + trainer = build_trainer(name=Trainers.ofa_tasks, default_args=args) trainer.train() - assert len( - glob.glob(osp.join(trainer.work_dir, - 'best_epoch*_accuracy*.pth'))) == 2 - if os.path.exists(self.trainer.work_dir): - shutil.rmtree(self.trainer.work_dir) + + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, + os.path.join(WORKSPACE, 'output')) + shutil.rmtree(WORKSPACE) if __name__ == '__main__': From 1682ea7dec8bcf6fb9bc6c26fc6a915a8dc0ba8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Mon, 24 Oct 2022 18:07:18 +0800 Subject: [PATCH 20/54] fix local path --- tests/trainers/test_ofa_trainer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index fe7672df..f21cb3da 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -1,7 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import glob import os -import os.path as osp import shutil import unittest @@ -54,7 +52,7 @@ class TestOfaTrainer(unittest.TestCase): 'drop_worst_ratio': 0.0, 'ignore_eos': False, 'ignore_prefix_size': 0, - 'label_smoothing': 0.0, + 'label_smoothing': 0.1, 'reg_alpha': 1.0, 'report_accuracy': False, 'sample_patch_num': 196, @@ -77,11 +75,11 @@ class TestOfaTrainer(unittest.TestCase): def test_trainer_std(self): WORKSPACE = './workspace/ckpts/caption' os.makedirs(WORKSPACE, exist_ok=True) - config_file = os.path.join(WORKSPACE, 'configuration.json') + config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) with open(config_file, 'w') as writer: json.dump(self.finetune_cfg, writer) - pretrained_model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_large_en' + pretrained_model = 'damo/ofa_image-caption_coco_large_en' args = dict( model=pretrained_model, work_dir=WORKSPACE, From 1ecf588c862b6a19242b88ae41868eb9fbc5a118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Mon, 24 Oct 2022 20:56:58 +0800 Subject: [PATCH 21/54] update finetune --- modelscope/metrics/ciderD/__init__.py | 1 + modelscope/metrics/ciderD/ciderD.py | 57 +++++ modelscope/metrics/ciderD/ciderD_scorer.py | 233 ++++++++++++++++++ .../multi_modal/ofa/ofa_trainer_utils.py | 4 +- 4 files changed, 293 insertions(+), 2 deletions(-) create mode 100755 modelscope/metrics/ciderD/__init__.py create mode 100755 modelscope/metrics/ciderD/ciderD.py create mode 100755 modelscope/metrics/ciderD/ciderD_scorer.py diff --git a/modelscope/metrics/ciderD/__init__.py b/modelscope/metrics/ciderD/__init__.py new file mode 100755 index 00000000..3f7d85bb --- /dev/null +++ b/modelscope/metrics/ciderD/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/modelscope/metrics/ciderD/ciderD.py b/modelscope/metrics/ciderD/ciderD.py new file mode 100755 index 00000000..05c7eb23 --- /dev/null +++ b/modelscope/metrics/ciderD/ciderD.py @@ 
-0,0 +1,57 @@
+# Filename: ciderD.py
+#
+# Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric
+#              by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
+#
+# Creation Date: Sun Feb 8 14:16:54 2015
+#
+# Authors: Ramakrishna Vedantam and Tsung-Yi Lin
+from __future__ import absolute_import, division, print_function
+
+from .ciderD_scorer import CiderScorer
+
+
+class CiderD:
+    """
+    Main Class to compute the CIDEr metric
+
+    """
+
+    def __init__(self, n=4, sigma=6.0, df='corpus'):
+        # set cider to sum over 1 to 4-grams
+        self._n = n
+        # set the standard deviation parameter for gaussian penalty
+        self._sigma = sigma
+        # set where to compute document frequencies from
+        self._df = df
+        self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)
+
+    def compute_score(self, gts, res):
+        """
+        Main function to compute CIDEr score
+        :param gts (dict) : dictionary mapping an image id to a list of tokenized reference sentences
+        :param res (list) : list of dicts, each holding an 'image_id' and a single-element 'caption' list
+        :return: cider (float) : computed CIDEr score for the corpus
+        """ # noqa
+
+        # clear all the previous hypos and refs
+        tmp_cider_scorer = self.cider_scorer.copy_empty()
+        tmp_cider_scorer.clear()
+        for res_id in res:
+
+            hypo = res_id['caption']
+            ref = gts[res_id['image_id']]
+
+            # Sanity check.
+            assert (type(hypo) is list)
+            assert (len(hypo) == 1)
+            assert (type(ref) is list)
+            assert (len(ref) > 0)
+            tmp_cider_scorer += (hypo[0], ref)
+
+        (score, scores) = tmp_cider_scorer.compute_score()
+
+        return score, scores
+
+    def method(self):
+        return 'CIDEr-D'
diff --git a/modelscope/metrics/ciderD/ciderD_scorer.py b/modelscope/metrics/ciderD/ciderD_scorer.py
new file mode 100755
index 00000000..4157ec11
--- /dev/null
+++ b/modelscope/metrics/ciderD/ciderD_scorer.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+# Tsung-Yi Lin
+# Ramakrishna Vedantam
+from __future__ import absolute_import, division, print_function
+import copy
+import math
+import os
+import pdb
+from collections import defaultdict
+
+import numpy as np
+import six
+from six.moves import cPickle
+
+
+def precook(s, n=4, out=False):
+    """
+    Takes a string as input and returns an object that can be given to
+    either cook_refs or cook_test. This is optional: cook_refs and cook_test
+    can take string arguments as well.
+    :param s: string : sentence to be converted into ngrams
+    :param n: int : number of ngrams for which representation is calculated
+    :return: term frequency vector for occurring ngrams
+    """
+    words = s.split()
+    counts = defaultdict(int)
+    for k in range(1, n + 1):
+        for i in range(len(words) - k + 1):
+            ngram = tuple(words[i:i + k])
+            counts[ngram] += 1
+    return counts
+
+
+def cook_refs(refs, n=4):  # lhuang: oracle will call with "average"
+    '''Takes a list of reference sentences for a single segment
+    and returns an object that encapsulates everything that BLEU
+    needs to know about them.
+    :param refs: list of string : reference sentences for some image
+    :param n: int : number of ngrams for which (ngram) representation is calculated
+    :return: result (list of dict)
+    '''
+    return [precook(ref, n) for ref in refs]
+
+
+def cook_test(test, n=4):
+    '''Takes a test sentence and returns an object that
+    encapsulates everything that BLEU needs to know about it.
+ :param test: list of string : hypothesis sentence for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (dict) + ''' + return precook(test, n, True) + + +class CiderScorer(object): + """CIDEr scorer. + """ + + def copy(self): + ''' copy the refs.''' + new = CiderScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + return new + + def copy_empty(self): + new = CiderScorer(df_mode='corpus', n=self.n, sigma=self.sigma) + new.df_mode = self.df_mode + new.ref_len = self.ref_len + new.document_frequency = self.document_frequency + return new + + def __init__(self, df_mode='corpus', test=None, refs=None, n=4, sigma=6.0): + ''' singular instance ''' + self.n = n + self.sigma = sigma + self.crefs = [] + self.ctest = [] + self.df_mode = df_mode + self.ref_len = None + if self.df_mode != 'corpus': + pkl_file = cPickle.load( + open(df_mode, 'rb'), + **(dict(encoding='latin1') if six.PY3 else {})) + self.ref_len = np.log(float(pkl_file['ref_len'])) + self.document_frequency = pkl_file['document_frequency'] + else: + self.document_frequency = None + self.cook_append(test, refs) + + def clear(self): + self.crefs = [] + self.ctest = [] + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + self.ctest.append(cook_test(test)) # N.B.: -1 + else: + self.ctest.append( + None) # lens of crefs and ctest have to match + + def size(self): + assert len(self.crefs) == len( + self.ctest), 'refs/test mismatch! %d<>%d' % (len( + self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + # avoid creating new CiderScorer instances + self.cook_append(other[0], other[1]) + else: + self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + + return self + + def compute_doc_freq(self): + """ + Compute term frequency for reference data. + This will be used to compute idf (inverse document frequency later) + The term frequency is stored in the object + :return: None + """ + for refs in self.crefs: + # refs, k ref captions of one image + for ngram in set([ + ngram for ref in refs for (ngram, count) in ref.items() + ]): # noqa + self.document_frequency[ngram] += 1 + + def compute_cider(self): + + def counts2vec(cnts): + """ + Function maps counts of ngram to vector of tfidf weights. + The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. + The n-th entry of array denotes length of n-grams. + :param cnts: + :return: vec (array of dict), norm (array of float), length (int) + """ + vec = [defaultdict(float) for _ in range(self.n)] + length = 0 + norm = [0.0 for _ in range(self.n)] + for (ngram, term_freq) in cnts.items(): + # give word count 1 if it doesn't appear in reference corpus + df = np.log(max(1.0, self.document_frequency[ngram])) + # ngram index + n = len(ngram) - 1 + # tf (term_freq) * idf (precomputed idf) for n-grams + vec[n][ngram] = float(term_freq) * (self.ref_len - df) + # compute norm for the vector. the norm will be used for computing similarity + norm[n] += pow(vec[n][ngram], 2) + + if n == 1: + length += term_freq + norm = [np.sqrt(n) for n in norm] + return vec, norm, length + + def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): + ''' + Compute the cosine similarity of two vectors. 
+            :param vec_hyp: array of dictionary for vector corresponding to hypothesis
+            :param vec_ref: array of dictionary for vector corresponding to reference
+            :param norm_hyp: array of float for vector corresponding to hypothesis
+            :param norm_ref: array of float for vector corresponding to reference
+            :param length_hyp: int containing length of hypothesis
+            :param length_ref: int containing length of reference
+            :return: array of score for each n-grams cosine similarity
+            '''
+            delta = float(length_hyp - length_ref)
+            # measure cosine similarity
+            val = np.array([0.0 for _ in range(self.n)])
+            for n in range(self.n):
+                # ngram
+                for (ngram, count) in vec_hyp[n].items():
+                    # vrama91 : added clipping
+                    val[n] += min(vec_hyp[n][ngram],
+                                  vec_ref[n][ngram]) * vec_ref[n][ngram]
+
+                if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
+                    val[n] /= (norm_hyp[n] * norm_ref[n])
+
+                assert (not math.isnan(val[n]))
+                # vrama91: added a length based gaussian penalty
+                val[n] *= np.e**(-(delta**2) / (2 * self.sigma**2))
+            return val
+
+        # compute log reference length
+        if self.df_mode == 'corpus':
+            self.ref_len = np.log(float(len(self.crefs)))
+        # elif self.df_mode == "coco-val-df":
+        # if coco option selected, use length of coco-val set
+        # self.ref_len = np.log(float(40504))
+
+        scores = []
+        for test, refs in zip(self.ctest, self.crefs):
+            # compute vector for test captions
+            vec, norm, length = counts2vec(test)
+            # compute vector for ref captions
+            score = np.array([0.0 for _ in range(self.n)])
+            for ref in refs:
+                vec_ref, norm_ref, length_ref = counts2vec(ref)
+                score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
+            # change by vrama91 - mean of ngram scores, instead of sum
+            score_avg = np.mean(score)
+            # divide by number of references
+            score_avg /= len(refs)
+            # multiply score by 10
+            score_avg *= 10.0
+            # append score of an image to the score list
+            scores.append(score_avg)
+        return scores
+
+    def compute_score(self, option=None, verbose=0):
+        # compute idf
+        if self.df_mode == 'corpus':
+            self.document_frequency = defaultdict(float)
+            self.compute_doc_freq()
+            # assert to check document frequency
+            assert (len(self.ctest) >= max(self.document_frequency.values()))
+            # import json for now and write the corresponding files
+        # compute cider score
+        score = self.compute_cider()
+        # debug
+        # print score
+        return np.mean(np.array(score)), np.array(score)
diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py
index b2e54ec6..2189a5db 100644
--- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py
+++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py
@@ -83,8 +83,8 @@ def label_smoothed_nll_loss(lprobs,
         lprobs = lprobs[indices]
     ntokens = loss.numel()
-    nll_loss = nll_loss.sum()
-    loss = loss.sum()
+    nll_loss = nll_loss.sum() / ntokens  # handled later in grads
+    loss = loss.sum() / ntokens  # handled later in grads
     if use_rdrop:
         true_batch_size = lprobs.size(0) // 2
         p = lprobs[:true_batch_size]

From 428599f3e571cdc0e4b862aa12157559a5c9cd97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=A1=8C=E5=97=94?=
Date: Mon, 24 Oct 2022 21:38:31 +0800
Subject: [PATCH 22/54] update finetune

---
 modelscope/preprocessors/ofa/image_captioning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py
index 99eda15d..af623297 100644
--- a/modelscope/preprocessors/ofa/image_captioning.py
+++ b/modelscope/preprocessors/ofa/image_captioning.py
@@ -62,6 +62,6 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } - if self.column_map['text'] in data: + if 'text' in self.column_map and self.column_map['text'] in data: sample['label'] = data[self.column_map['text']] return sample From 46c3bdcfe8bbf56d90d8fd1f278ded870e57c7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Mon, 24 Oct 2022 23:19:23 +0800 Subject: [PATCH 23/54] fix a bug --- modelscope/preprocessors/ofa/ocr_recognition.py | 16 ++++++++++++---- tests/trainers/test_ofa_trainer.py | 8 ++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 4c8c245a..1761dbd4 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -8,6 +8,7 @@ from torchvision.transforms import InterpolationMode from torchvision.transforms import functional as F from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) @@ -57,14 +58,21 @@ def ocr_resize(img, patch_image_size, is_document=False): class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir) + super(OfaOcrRecognitionPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform if self.cfg.model.imagenet_default_mean_and_std: mean = IMAGENET_DEFAULT_MEAN @@ -87,7 +95,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', '图片上的文字是什么?') - inputs = self.get_inputs(prompt) + inputs = self.tokenize_text(prompt) sample = { 'source': inputs, diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index f21cb3da..894e67d2 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -36,10 +36,10 @@ class TestOfaTrainer(unittest.TestCase): # 'launcher': 'pytorch', 'max_epochs': 1, 'use_fp16': True, - 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + 'dataloader': {'batch_size_per_gpu': 1, 'workers_per_gpu': 0}, 'lr_scheduler': {'name': 'polynomial_decay', 'warmup_proportion': 0.01, - 'lr_end': 1e-07}, + 'lr_endo': 1e-07}, 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', @@ -86,11 +86,11 @@ class TestOfaTrainer(unittest.TestCase): train_dataset=MsDataset.load( 'coco_2014_caption', namespace='modelscope', - split='train[:100]'), + split='train[:20]'), eval_dataset=MsDataset.load( 'coco_2014_caption', namespace='modelscope', - split='validation[:20]'), + split='validation[:10]'), metrics=[Metrics.BLEU], cfg_file=config_file) trainer = build_trainer(name=Trainers.ofa_tasks, default_args=args) From 85a7832d575a3ade8210ff9a434df71862dbf16b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Tue, 25 Oct 2022 00:52:35 +0800 Subject: [PATCH 24/54] fix a typo --- tests/trainers/test_ofa_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 894e67d2..786599bb 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -39,7 +39,7 @@ class TestOfaTrainer(unittest.TestCase): 'dataloader': {'batch_size_per_gpu': 1, 'workers_per_gpu': 0}, 'lr_scheduler': {'name': 'polynomial_decay', 'warmup_proportion': 0.01, - 'lr_endo': 1e-07}, + 'lr_end': 1e-07}, 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', From 9e3f035fa71ef15c6dcf10a4388630e6c634dc40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Tue, 25 Oct 2022 10:13:48 +0800 Subject: [PATCH 25/54] fix a ut bug --- tests/trainers/test_ofa_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 786599bb..46dc5c8b 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -79,7 +79,7 @@ class TestOfaTrainer(unittest.TestCase): with open(config_file, 'w') as writer: json.dump(self.finetune_cfg, writer) - pretrained_model = 'damo/ofa_image-caption_coco_large_en' + pretrained_model = 'damo/ofa_image-caption_coco_distilled_en' args = dict( model=pretrained_model, work_dir=WORKSPACE, @@ -97,8 +97,8 @@ class TestOfaTrainer(unittest.TestCase): trainer.train() self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, - os.path.join(WORKSPACE, 'output')) - shutil.rmtree(WORKSPACE) + os.listdir(os.path.join(WORKSPACE, 'output'))) + # shutil.rmtree(WORKSPACE) if __name__ == '__main__': From df5bd86048618b7496bde0c8050bc47092a0f68c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Tue, 25 Oct 2022 10:15:06 +0800 Subject: [PATCH 26/54] fix a ut bug --- tests/trainers/test_ofa_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 46dc5c8b..75b8cbbf 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -98,7 +98,7 @@ class TestOfaTrainer(unittest.TestCase): self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, os.listdir(os.path.join(WORKSPACE, 'output'))) - # shutil.rmtree(WORKSPACE) + shutil.rmtree(WORKSPACE) if __name__ == '__main__': From 2288a0fdf34e3c64e39b44d116bd7eabc9f66440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Tue, 25 Oct 2022 10:18:33 +0800 Subject: [PATCH 27/54] fix all comments --- modelscope/metainfo.py | 2 +- modelscope/preprocessors/multi_modal.py | 6 ++---- modelscope/trainers/multi_modal/ofa/ofa_trainer.py | 2 +- tests/trainers/test_ofa_trainer.py | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 48d37eb2..d3e4904e 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -282,7 +282,7 @@ class Trainers(object): # multi-modal trainers clip_multi_modal_embedding = 'clip-multi-modal-embedding' - ofa_tasks = 'ofa' + ofa = 'ofa' # cv trainers image_instance_segmentation = 'image-instance-segmentation' diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 3c4ac58a..256c5243 100644 --- 
a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -74,9 +74,7 @@ class OfaPreprocessor(Preprocessor): data[key] = item return data - def _compatible_with_pretrain(self, data): - # 预训练的时候使用的image都是经过pil转换的,PIL save的时候一般会进行有损压缩,为了保证和预训练一致 - # 所以增加了这个逻辑 + def _ofa_input_compatibility_conversion(self, data): if 'image' in data and self.cfg.model.get('type', None) == 'ofa': if isinstance(data['image'], str): image = load_image(data['image']) @@ -95,7 +93,7 @@ class OfaPreprocessor(Preprocessor): data = input else: data = self._build_dict(input) - data = self._compatible_with_pretrain(data) + data = self._ofa_input_compatibility_conversion(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index c287c182..02853925 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -27,7 +27,7 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, get_schedule) -@TRAINERS.register_module(module_name=Trainers.ofa_tasks) +@TRAINERS.register_module(module_name=Trainers.ofa) class OFATrainer(EpochBasedTrainer): def __init__( diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 75b8cbbf..06003625 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -93,7 +93,7 @@ class TestOfaTrainer(unittest.TestCase): split='validation[:10]'), metrics=[Metrics.BLEU], cfg_file=config_file) - trainer = build_trainer(name=Trainers.ofa_tasks, default_args=args) + trainer = build_trainer(name=Trainers.ofa, default_args=args) trainer.train() self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, From 1bb1eeec775c6bf63c440a1a148e34400959136a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Tue, 25 Oct 2022 10:55:24 +0800 Subject: [PATCH 28/54] fix ut --- tests/trainers/test_ofa_trainer.py | 7 +++---- tests/trainers/workspace/ckpts/caption/configuration.json | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 tests/trainers/workspace/ckpts/caption/configuration.json diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index ac2e0678..9a8a7d90 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -86,9 +86,8 @@ class TestOfaTrainer(unittest.TestCase): model=pretrained_model, work_dir=WORKSPACE, train_dataset=MsDataset.load( - 'coco_2014_caption', - namespace='modelscope', - split='train[:12]'), + 'coco_2014_caption', namespace='modelscope', + split='train[:4]'), eval_dataset=MsDataset.load( 'coco_2014_caption', namespace='modelscope', @@ -99,7 +98,7 @@ class TestOfaTrainer(unittest.TestCase): trainer.train() self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, - os.path.join(WORKSPACE, 'output')) + os.listdir(os.path.join(WORKSPACE, 'output'))) shutil.rmtree(WORKSPACE) diff --git a/tests/trainers/workspace/ckpts/caption/configuration.json b/tests/trainers/workspace/ckpts/caption/configuration.json new file mode 100644 index 00000000..952693ba --- /dev/null +++ b/tests/trainers/workspace/ckpts/caption/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "image-captioning", "model": {"type": "ofa", "beam_search": {"beam_size": 5, "max_len_b": 16, "min_len": 1, "no_repeat_ngram_size": 0}, "seed": 7, "max_src_length": 256, "language": "en", "gen_type": "generation", "patch_image_size": 
480, "max_image_size": 480, "imagenet_default_mean_and_std": false}, "pipeline": {"type": "image-captioning"}, "dataset": {"column_map": {"text": "caption"}}, "train": {"work_dir": "work/ckpts/caption", "max_epochs": 1, "use_fp16": true, "dataloader": {"batch_size_per_gpu": 4, "workers_per_gpu": 0}, "lr_scheduler": {"name": "polynomial_decay", "warmup_proportion": 0.01, "lr_end": 1e-07}, "lr_scheduler_hook": {"type": "LrSchedulerHook", "by_epoch": false}, "optimizer": {"type": "AdamW", "lr": 5e-05, "weight_decay": 0.01}, "optimizer_hook": {"type": "TorchAMPOptimizerHook", "cumulative_iters": 1, "grad_clip": {"max_norm": 1.0, "norm_type": 2}, "loss_keys": "loss"}, "criterion": {"name": "AdjustLabelSmoothedCrossEntropyCriterion", "constraint_range": null, "drop_worst_after": 0, "drop_worst_ratio": 0.0, "ignore_eos": false, "ignore_prefix_size": 0, "label_smoothing": 0.0, "reg_alpha": 1.0, "report_accuracy": false, "sample_patch_num": 196, "sentence_avg": false, "use_rdrop": true}, "hooks": [{"type": "BestCkptSaverHook", "metric_key": "bleu-4", "interval": 100}, {"type": "TextLoggerHook", "interval": 1}, {"type": "IterTimerHook"}, {"type": "EvaluationHook", "by_epoch": true, "interval": 1}]}, "evaluation": {"dataloader": {"batch_size_per_gpu": 4, "workers_per_gpu": 0}, "metrics": [{"type": "bleu", "eval_tokenized_bleu": false, "ref_name": "labels", "hyp_name": "caption"}]}, "preprocessor": []} From cc8b78eac8ae5c4a6288c04fdb9fc370527273e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A1=8C=E5=97=94?= Date: Tue, 25 Oct 2022 11:55:37 +0800 Subject: [PATCH 29/54] update rdrop --- modelscope/trainers/multi_modal/ofa/ofa_trainer.py | 2 +- modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py | 2 +- tests/trainers/test_ofa_trainer.py | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 34919fb2..c36a886e 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -131,7 +131,7 @@ class OFATrainer(EpochBasedTrainer): model.train() # model_outputs = model.forward(inputs) loss, sample_size, logging_output = self.criterion(model, inputs) - train_outputs = {'loss': loss / 100} + train_outputs = {'loss': loss} # add model output info to log if 'log_vars' not in train_outputs: default_keys_pattern = ['loss'] diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 3ba5c91f..3c38884c 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -144,7 +144,7 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): sample_size = ( sample['target'].size(0) if self.sentence_avg else ntokens) logging_output = { - 'loss': loss.data / 100, + 'loss': loss.data, 'nll_loss': nll_loss.data, 'ntokens': sample['ntokens'], 'nsentences': sample['nsentences'], diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 5c252e0a..21ddce21 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -1,7 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import glob import os -import os.path as osp import shutil import unittest @@ -98,8 +96,9 @@ class TestOfaTrainer(unittest.TestCase): trainer = build_trainer(name=Trainers.ofa, default_args=args) trainer.train() - self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, - os.listdir(os.path.join(WORKSPACE, 'output'))) + self.assertIn( + ModelFile.TORCH_MODEL_BIN_FILE, + os.listdir(os.path.join(WORKSPACE, ModelFile.TRAIN_OUTPUT_DIR))) shutil.rmtree(WORKSPACE) From c077dea07213e599f109e7f2fe94bea5d27baaf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 26 Oct 2022 10:52:10 +0800 Subject: [PATCH 30/54] add ocr-finetune --- modelscope/metrics/accuracy_metric.py | 7 ++ modelscope/metrics/ned_metric.py | 56 +++++++++++++ .../preprocessors/ofa/ocr_recognition.py | 22 ++++- requirements/multi-modal.txt | 1 + tests/trainers/test_ofa_trainer.py | 83 +++++++++++++++---- .../ckpts/caption/configuration.json | 1 - 6 files changed, 152 insertions(+), 18 deletions(-) create mode 100644 modelscope/metrics/ned_metric.py delete mode 100644 tests/trainers/workspace/ckpts/caption/configuration.json diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index 1761786e..8a9a7cce 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -27,6 +27,13 @@ class AccuracyMetric(Metric): label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS ground_truths = inputs[label_name] eval_results = outputs[label_name] + for key in [ + OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, + OutputKeys.LABELS, OutputKeys.SCORES + ]: + if key in outputs and outputs[key] is not None: + eval_results = outputs[key] + break assert type(ground_truths) == type(eval_results) if isinstance(ground_truths, list): self.preds.extend(eval_results) diff --git a/modelscope/metrics/ned_metric.py b/modelscope/metrics/ned_metric.py new file mode 100644 index 00000000..1ab97aa8 --- /dev/null +++ b/modelscope/metrics/ned_metric.py @@ -0,0 +1,56 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Dict + +import numpy as np +from similarity.normalized_levenshtein import NormalizedLevenshtein + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module(group_key=default_group, module_name=Metrics.NED) +class NedMetric(Metric): + """The metric computation class for classification classes. + + This metric class calculates accuracy for the whole input batches. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.ned = NormalizedLevenshtein() + self.preds = [] + self.labels = [] + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[label_name] + for key in [ + OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, + OutputKeys.LABELS, OutputKeys.SCORES + ]: + if key in outputs and outputs[key] is not None: + eval_results = outputs[key] + break + assert type(ground_truths) == type(eval_results) + if isinstance(ground_truths, list): + self.preds.extend(eval_results) + self.labels.extend(ground_truths) + elif isinstance(ground_truths, np.ndarray): + self.preds.extend(eval_results.tolist()) + self.labels.extend(ground_truths.tolist()) + else: + raise 'only support list or np.ndarray' + + def evaluate(self): + assert len(self.preds) == len(self.labels) + return { + MetricKeys.NED: (np.asarray([ + self.ned.distance(pred, ref) + for pred, ref in zip(self.preds, self.labels) + ])).mean().item() + } diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 1761dbd4..26fff9d2 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -91,8 +91,24 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = data[self.column_map['text']] + target = target.translate(self.transtab).strip() + target_token_list = target.strip().split() + target = ' '.join(target_token_list[:self.max_tgt_length]) + sample['target'] = self.tokenize_text(target, add_bos=False) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, sample['target'][:-1]]) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', '图片上的文字是什么?') inputs = self.tokenize_text(prompt) @@ -102,4 +118,6 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } + if 'text' in self.column_map and self.column_map['text'] in data: + sample['label'] = data[self.column_map['text']] return sample diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 255f6155..4216475c 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -6,6 +6,7 @@ pycocotools>=2.0.4 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 sacrebleu +strsim taming-transformers-rom1504 timm tokenizers diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 21ddce21..20acbaac 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -15,9 +15,64 @@ from modelscope.utils.test_utils import test_level class TestOfaTrainer(unittest.TestCase): def setUp(self) -> None: + # self.finetune_cfg = \ + # {'framework': 'pytorch', + # 'task': 'image-captioning', + # 
'model': {'type': 'ofa', + # 'beam_search': {'beam_size': 5, + # 'max_len_b': 16, + # 'min_len': 1, + # 'no_repeat_ngram_size': 0}, + # 'seed': 7, + # 'max_src_length': 256, + # 'language': 'en', + # 'gen_type': 'generation', + # 'patch_image_size': 480, + # 'max_image_size': 480, + # 'imagenet_default_mean_and_std': False}, + # 'pipeline': {'type': 'image-captioning'}, + # 'dataset': {'column_map': {'text': 'caption'}}, + # 'train': {'work_dir': 'work/ckpts/caption', + # # 'launcher': 'pytorch', + # 'max_epochs': 1, + # 'use_fp16': True, + # 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + # 'lr_scheduler': {'name': 'polynomial_decay', + # 'warmup_proportion': 0.01, + # 'lr_end': 1e-07}, + # 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, + # 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, + # 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', + # 'cumulative_iters': 1, + # 'grad_clip': {'max_norm': 1.0, 'norm_type': 2}, + # 'loss_keys': 'loss'}, + # 'criterion': {'name': 'AdjustLabelSmoothedCrossEntropyCriterion', + # 'constraint_range': None, + # 'drop_worst_after': 0, + # 'drop_worst_ratio': 0.0, + # 'ignore_eos': False, + # 'ignore_prefix_size': 0, + # 'label_smoothing': 0.1, + # 'reg_alpha': 1.0, + # 'report_accuracy': False, + # 'sample_patch_num': 196, + # 'sentence_avg': False, + # 'use_rdrop': True}, + # 'hooks': [{'type': 'BestCkptSaverHook', + # 'metric_key': 'bleu-4', + # 'interval': 100}, + # {'type': 'TextLoggerHook', 'interval': 1}, + # {'type': 'IterTimerHook'}, + # {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, + # 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + # 'metrics': [{'type': 'bleu', + # 'eval_tokenized_bleu': False, + # 'ref_name': 'labels', + # 'hyp_name': 'caption'}]}, + # 'preprocessor': []} self.finetune_cfg = \ {'framework': 'pytorch', - 'task': 'image-captioning', + 'task': 'ocr-recognition', 'model': {'type': 'ofa', 'beam_search': {'beam_size': 5, 'max_len_b': 16, @@ -25,18 +80,19 @@ class TestOfaTrainer(unittest.TestCase): 'no_repeat_ngram_size': 0}, 'seed': 7, 'max_src_length': 256, - 'language': 'en', + 'language': 'zh', 'gen_type': 'generation', 'patch_image_size': 480, + 'is_document': False, 'max_image_size': 480, 'imagenet_default_mean_and_std': False}, - 'pipeline': {'type': 'image-captioning'}, + 'pipeline': {'type': 'ofa-ocr-recognition'}, 'dataset': {'column_map': {'text': 'caption'}}, - 'train': {'work_dir': 'work/ckpts/caption', + 'train': {'work_dir': 'work/ckpts/recognition', # 'launcher': 'pytorch', 'max_epochs': 1, 'use_fp16': True, - 'dataloader': {'batch_size_per_gpu': 1, 'workers_per_gpu': 0}, + 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, 'lr_scheduler': {'name': 'polynomial_decay', 'warmup_proportion': 0.01, 'lr_end': 1e-07}, @@ -59,39 +115,36 @@ class TestOfaTrainer(unittest.TestCase): 'sentence_avg': False, 'use_rdrop': True}, 'hooks': [{'type': 'BestCkptSaverHook', - 'metric_key': 'bleu-4', + 'metric_key': 'ned', + 'rule': 'min', 'interval': 100}, {'type': 'TextLoggerHook', 'interval': 1}, {'type': 'IterTimerHook'}, {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, - 'metrics': [{'type': 'bleu', - 'eval_tokenized_bleu': False, - 'ref_name': 'labels', - 'hyp_name': 'caption'}]}, + 'metrics': [{'type': 'ned'}]}, 'preprocessor': []} @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def 
test_trainer_std(self): - WORKSPACE = './workspace/ckpts/caption' + WORKSPACE = './workspace/ckpts/recognition' os.makedirs(WORKSPACE, exist_ok=True) config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) with open(config_file, 'w') as writer: json.dump(self.finetune_cfg, writer) - pretrained_model = 'damo/ofa_image-caption_coco_distilled_en' + pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh' args = dict( model=pretrained_model, work_dir=WORKSPACE, train_dataset=MsDataset.load( 'coco_2014_caption', namespace='modelscope', - split='train[:20]'), + split='train[:12]'), eval_dataset=MsDataset.load( 'coco_2014_caption', namespace='modelscope', - split='validation[:10]'), - metrics=[Metrics.BLEU], + split='validation[:4]'), cfg_file=config_file) trainer = build_trainer(name=Trainers.ofa, default_args=args) trainer.train() diff --git a/tests/trainers/workspace/ckpts/caption/configuration.json b/tests/trainers/workspace/ckpts/caption/configuration.json deleted file mode 100644 index 952693ba..00000000 --- a/tests/trainers/workspace/ckpts/caption/configuration.json +++ /dev/null @@ -1 +0,0 @@ -{"framework": "pytorch", "task": "image-captioning", "model": {"type": "ofa", "beam_search": {"beam_size": 5, "max_len_b": 16, "min_len": 1, "no_repeat_ngram_size": 0}, "seed": 7, "max_src_length": 256, "language": "en", "gen_type": "generation", "patch_image_size": 480, "max_image_size": 480, "imagenet_default_mean_and_std": false}, "pipeline": {"type": "image-captioning"}, "dataset": {"column_map": {"text": "caption"}}, "train": {"work_dir": "work/ckpts/caption", "max_epochs": 1, "use_fp16": true, "dataloader": {"batch_size_per_gpu": 4, "workers_per_gpu": 0}, "lr_scheduler": {"name": "polynomial_decay", "warmup_proportion": 0.01, "lr_end": 1e-07}, "lr_scheduler_hook": {"type": "LrSchedulerHook", "by_epoch": false}, "optimizer": {"type": "AdamW", "lr": 5e-05, "weight_decay": 0.01}, "optimizer_hook": {"type": "TorchAMPOptimizerHook", "cumulative_iters": 1, "grad_clip": {"max_norm": 1.0, "norm_type": 2}, "loss_keys": "loss"}, "criterion": {"name": "AdjustLabelSmoothedCrossEntropyCriterion", "constraint_range": null, "drop_worst_after": 0, "drop_worst_ratio": 0.0, "ignore_eos": false, "ignore_prefix_size": 0, "label_smoothing": 0.0, "reg_alpha": 1.0, "report_accuracy": false, "sample_patch_num": 196, "sentence_avg": false, "use_rdrop": true}, "hooks": [{"type": "BestCkptSaverHook", "metric_key": "bleu-4", "interval": 100}, {"type": "TextLoggerHook", "interval": 1}, {"type": "IterTimerHook"}, {"type": "EvaluationHook", "by_epoch": true, "interval": 1}]}, "evaluation": {"dataloader": {"batch_size_per_gpu": 4, "workers_per_gpu": 0}, "metrics": [{"type": "bleu", "eval_tokenized_bleu": false, "ref_name": "labels", "hyp_name": "caption"}]}, "preprocessor": []} From 90d47832c07d3e5275cc5f1acbb7e21e48bf30ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 26 Oct 2022 11:45:50 +0800 Subject: [PATCH 31/54] add ocr-finetune ned --- modelscope/metrics/accuracy_metric.py | 2 +- modelscope/metrics/ned_metric.py | 41 ++++++++++++++++++++++++--- tests/trainers/test_ofa_trainer.py | 9 +++--- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index 8a9a7cce..5459ae66 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -42,7 +42,7 @@ class AccuracyMetric(Metric): self.preds.extend(eval_results.tolist()) self.labels.extend(ground_truths.tolist()) else: - 
raise 'only support list or np.ndarray' + raise Exception('only support list or np.ndarray') def evaluate(self): assert len(self.preds) == len(self.labels) diff --git a/modelscope/metrics/ned_metric.py b/modelscope/metrics/ned_metric.py index 1ab97aa8..0a2141cb 100644 --- a/modelscope/metrics/ned_metric.py +++ b/modelscope/metrics/ned_metric.py @@ -14,9 +14,9 @@ from .builder import METRICS, MetricKeys @METRICS.register_module(group_key=default_group, module_name=Metrics.NED) class NedMetric(Metric): - """The metric computation class for classification classes. + """The ned metric computation class for classification classes. - This metric class calculates accuracy for the whole input batches. + This metric class calculates the levenshtein distance between sentences for the whole input batches. """ def __init__(self, *args, **kwargs): @@ -44,13 +44,46 @@ class NedMetric(Metric): self.preds.extend(eval_results.tolist()) self.labels.extend(ground_truths.tolist()) else: - raise 'only support list or np.ndarray' + raise Exception('only support list or np.ndarray') def evaluate(self): assert len(self.preds) == len(self.labels) return { MetricKeys.NED: (np.asarray([ - self.ned.distance(pred, ref) + 1.0 - NedMetric._distance(pred, ref) for pred, ref in zip(self.preds, self.labels) ])).mean().item() } + + @staticmethod + def _distance(pred, ref): + if pred is None or ref is None: + raise TypeError('Argument s0 is NoneType.') + if pred == ref: + return 0.0 + if len(pred) == 0: + return len(ref) + if len(ref) == 0: + return len(pred) + m_len = max(len(pred), len(ref)) + if m_len == 0: + return 0.0 + + def levenshtein(s0, s1): + v0 = [0] * (len(s1) + 1) + v1 = [0] * (len(s1) + 1) + + for i in range(len(v0)): + v0[i] = i + + for i in range(len(s0)): + v1[0] = i + 1 + for j in range(len(s1)): + cost = 1 + if s0[i] == s1[j]: + cost = 0 + v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost) + v0, v1 = v1, v0 + return v0[len(s1)] + + return levenshtein(pred, ref) / m_len diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 20acbaac..f627a419 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -87,7 +87,7 @@ class TestOfaTrainer(unittest.TestCase): 'max_image_size': 480, 'imagenet_default_mean_and_std': False}, 'pipeline': {'type': 'ofa-ocr-recognition'}, - 'dataset': {'column_map': {'text': 'caption'}}, + 'dataset': {'column_map': {'text': 'label'}}, 'train': {'work_dir': 'work/ckpts/recognition', # 'launcher': 'pytorch', 'max_epochs': 1, @@ -116,7 +116,6 @@ class TestOfaTrainer(unittest.TestCase): 'use_rdrop': True}, 'hooks': [{'type': 'BestCkptSaverHook', 'metric_key': 'ned', - 'rule': 'min', 'interval': 100}, {'type': 'TextLoggerHook', 'interval': 1}, {'type': 'IterTimerHook'}, @@ -138,11 +137,13 @@ class TestOfaTrainer(unittest.TestCase): model=pretrained_model, work_dir=WORKSPACE, train_dataset=MsDataset.load( - 'coco_2014_caption', + 'ocr_fudanvi_zh', + subset_name='scene', namespace='modelscope', split='train[:12]'), eval_dataset=MsDataset.load( - 'coco_2014_caption', + 'ocr_fudanvi_zh', + subset_name='scene', namespace='modelscope', split='validation[:4]'), cfg_file=config_file) From 9d45274fbfa03866932dfb6225847c66d72554be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 26 Oct 2022 11:47:21 +0800 Subject: [PATCH 32/54] add ocr-finetune ned --- modelscope/metrics/ned_metric.py | 1 - requirements/multi-modal.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/modelscope/metrics/ned_metric.py 
b/modelscope/metrics/ned_metric.py index 0a2141cb..8a015f9c 100644 --- a/modelscope/metrics/ned_metric.py +++ b/modelscope/metrics/ned_metric.py @@ -3,7 +3,6 @@ from typing import Dict import numpy as np -from similarity.normalized_levenshtein import NormalizedLevenshtein from modelscope.metainfo import Metrics from modelscope.outputs import OutputKeys diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 4216475c..255f6155 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -6,7 +6,6 @@ pycocotools>=2.0.4 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 sacrebleu -strsim taming-transformers-rom1504 timm tokenizers From 5dd9698a33d05ee50c8b270a3327a7dd7ef1eda5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 26 Oct 2022 11:48:22 +0800 Subject: [PATCH 33/54] fix ocr-finetune ned --- modelscope/metrics/ned_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modelscope/metrics/ned_metric.py b/modelscope/metrics/ned_metric.py index 8a015f9c..6775b838 100644 --- a/modelscope/metrics/ned_metric.py +++ b/modelscope/metrics/ned_metric.py @@ -20,7 +20,6 @@ class NedMetric(Metric): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.ned = NormalizedLevenshtein() self.preds = [] self.labels = [] From 022fa4948aa2dce54225b117ca98d6f57bb67b88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 26 Oct 2022 19:44:54 +0800 Subject: [PATCH 34/54] fix ocr-finetune acc --- modelscope/metrics/accuracy_metric.py | 15 +++--- modelscope/metrics/ned_metric.py | 2 +- tests/trainers/test_ofa_trainer.py | 71 ++++----------------------- 3 files changed, 17 insertions(+), 71 deletions(-) diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index 5459ae66..953ece4c 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -35,14 +35,13 @@ class AccuracyMetric(Metric): eval_results = outputs[key] break assert type(ground_truths) == type(eval_results) - if isinstance(ground_truths, list): - self.preds.extend(eval_results) - self.labels.extend(ground_truths) - elif isinstance(ground_truths, np.ndarray): - self.preds.extend(eval_results.tolist()) - self.labels.extend(ground_truths.tolist()) - else: - raise Exception('only support list or np.ndarray') + for truth in ground_truths: + self.labels.append(truth) + for result in eval_results: + if isinstance(truth, str): + self.preds.append(result.strip().replace(' ', '')) + else: + self.preds.append(result) def evaluate(self): assert len(self.preds) == len(self.labels) diff --git a/modelscope/metrics/ned_metric.py b/modelscope/metrics/ned_metric.py index 6775b838..e87bb2c4 100644 --- a/modelscope/metrics/ned_metric.py +++ b/modelscope/metrics/ned_metric.py @@ -56,7 +56,7 @@ class NedMetric(Metric): @staticmethod def _distance(pred, ref): if pred is None or ref is None: - raise TypeError('Argument s0 is NoneType.') + raise TypeError('Argument (pred or ref) is NoneType.') if pred == ref: return 0.0 if len(pred) == 0: diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index f627a419..783f08f4 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -8,78 +8,23 @@ import json from modelscope.metainfo import Metrics, Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile +from 
modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.test_utils import test_level class TestOfaTrainer(unittest.TestCase): def setUp(self) -> None: - # self.finetune_cfg = \ - # {'framework': 'pytorch', - # 'task': 'image-captioning', - # 'model': {'type': 'ofa', - # 'beam_search': {'beam_size': 5, - # 'max_len_b': 16, - # 'min_len': 1, - # 'no_repeat_ngram_size': 0}, - # 'seed': 7, - # 'max_src_length': 256, - # 'language': 'en', - # 'gen_type': 'generation', - # 'patch_image_size': 480, - # 'max_image_size': 480, - # 'imagenet_default_mean_and_std': False}, - # 'pipeline': {'type': 'image-captioning'}, - # 'dataset': {'column_map': {'text': 'caption'}}, - # 'train': {'work_dir': 'work/ckpts/caption', - # # 'launcher': 'pytorch', - # 'max_epochs': 1, - # 'use_fp16': True, - # 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, - # 'lr_scheduler': {'name': 'polynomial_decay', - # 'warmup_proportion': 0.01, - # 'lr_end': 1e-07}, - # 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, - # 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, - # 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', - # 'cumulative_iters': 1, - # 'grad_clip': {'max_norm': 1.0, 'norm_type': 2}, - # 'loss_keys': 'loss'}, - # 'criterion': {'name': 'AdjustLabelSmoothedCrossEntropyCriterion', - # 'constraint_range': None, - # 'drop_worst_after': 0, - # 'drop_worst_ratio': 0.0, - # 'ignore_eos': False, - # 'ignore_prefix_size': 0, - # 'label_smoothing': 0.1, - # 'reg_alpha': 1.0, - # 'report_accuracy': False, - # 'sample_patch_num': 196, - # 'sentence_avg': False, - # 'use_rdrop': True}, - # 'hooks': [{'type': 'BestCkptSaverHook', - # 'metric_key': 'bleu-4', - # 'interval': 100}, - # {'type': 'TextLoggerHook', 'interval': 1}, - # {'type': 'IterTimerHook'}, - # {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, - # 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, - # 'metrics': [{'type': 'bleu', - # 'eval_tokenized_bleu': False, - # 'ref_name': 'labels', - # 'hyp_name': 'caption'}]}, - # 'preprocessor': []} self.finetune_cfg = \ {'framework': 'pytorch', 'task': 'ocr-recognition', 'model': {'type': 'ofa', 'beam_search': {'beam_size': 5, - 'max_len_b': 16, + 'max_len_b': 64, 'min_len': 1, 'no_repeat_ngram_size': 0}, 'seed': 7, - 'max_src_length': 256, + 'max_src_length': 128, 'language': 'zh', 'gen_type': 'generation', 'patch_image_size': 480, @@ -115,13 +60,13 @@ class TestOfaTrainer(unittest.TestCase): 'sentence_avg': False, 'use_rdrop': True}, 'hooks': [{'type': 'BestCkptSaverHook', - 'metric_key': 'ned', + 'metric_key': 'accuracy', 'interval': 100}, {'type': 'TextLoggerHook', 'interval': 1}, {'type': 'IterTimerHook'}, {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, - 'metrics': [{'type': 'ned'}]}, + 'metrics': [{'type': 'accuracy'}]}, 'preprocessor': []} @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -140,12 +85,14 @@ class TestOfaTrainer(unittest.TestCase): 'ocr_fudanvi_zh', subset_name='scene', namespace='modelscope', - split='train[:12]'), + split='train[:1000]', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), eval_dataset=MsDataset.load( 'ocr_fudanvi_zh', subset_name='scene', namespace='modelscope', - split='validation[:4]'), + split='test[:100]', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), cfg_file=config_file) trainer = build_trainer(name=Trainers.ofa, 
default_args=args) trainer.train() From 3b21ff10ec824b7ad6d062ce35c8cb7e990deec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Mon, 31 Oct 2022 16:57:49 +0800 Subject: [PATCH 35/54] fix ocr prepreocess --- modelscope/preprocessors/multi_modal.py | 1 - modelscope/preprocessors/ofa/ocr_recognition.py | 11 ++++++----- requirements/multi-modal.txt | 2 ++ tests/trainers/test_ofa_trainer.py | 5 ++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 256c5243..af241d83 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -93,7 +93,6 @@ class OfaPreprocessor(Preprocessor): data = input else: data = self._build_dict(input) - data = self._ofa_input_compatibility_conversion(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 26fff9d2..a0342c14 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -2,12 +2,12 @@ from typing import Any, Dict import torch -from PIL import Image +import unicodedata2 from torchvision import transforms from torchvision.transforms import InterpolationMode from torchvision.transforms import functional as F +from zhconv import convert -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data[self.column_map['text']] - target = target.translate(self.transtab).strip() + target = sample['label'] target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) sample['target'] = self.tokenize_text(target, add_bos=False) @@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]) } if 'text' in self.column_map and self.column_map['text'] in data: - sample['label'] = data[self.column_map['text']] + target = data[self.column_map['text']] + target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) + sample['label'] = target return sample diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 255f6155..578f0b54 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -11,3 +11,5 @@ timm tokenizers torchvision transformers>=4.12.0 +unicodedata2 +zhconv diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 3f68a9fb..6f96aea1 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -5,7 +5,7 @@ import unittest import json -from modelscope.metainfo import Trainers +from modelscope.metainfo import Metrics, Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import DownloadMode, ModelFile @@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase): 'ocr_fudanvi_zh', subset_name='scene', namespace='modelscope', - split='train[:200]', + split='train[800:900]', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), eval_dataset=MsDataset.load( 'ocr_fudanvi_zh', @@ -96,7 +96,6 @@ class TestOfaTrainer(unittest.TestCase): cfg_file=config_file) trainer = 
build_trainer(name=Trainers.ofa, default_args=args) trainer.train() - self.assertIn( ModelFile.TORCH_MODEL_BIN_FILE, os.listdir(os.path.join(WORKSPACE, ModelFile.TRAIN_OUTPUT_DIR))) From fd3679b54783f7e9ba12b15a379a1615095f296d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Mon, 31 Oct 2022 20:00:42 +0800 Subject: [PATCH 36/54] add classification prepreocess --- .../preprocessors/ofa/image_classification.py | 90 ++++++++++++++++--- .../preprocessors/ofa/ocr_recognition.py | 12 +-- 2 files changed, 80 insertions(+), 22 deletions(-) diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index 49968823..ffac9070 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -1,13 +1,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import functools from typing import Any, Dict import torch -from PIL import Image +from PIL import Image, ImageFile +from timm.data import create_transform from torchvision import transforms from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor +from .utils.vision_helper import RandomAugment + +ImageFile.LOAD_TRUNCATED_IMAGES = True +ImageFile.MAX_IMAGE_PIXELS = None +Image.MAX_IMAGE_PIXELS = None class OfaImageClassificationPreprocessor(OfaBasePreprocessor): @@ -28,18 +35,77 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): super(OfaImageClassificationPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform - self.patch_resize_transform = transforms.Compose([ - lambda image: image.convert('RGB'), - transforms.Resize( - (self.patch_image_size, self.patch_image_size), - interpolation=transforms.InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=self.mean, std=self.std), - ]) + if self.mode != ModeKeys.TRAIN: + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=self.mean, std=self.std), + ]) + else: + self.patch_resize_transform = create_transform( + input_size=self.patch_image_size, + is_training=True, + color_jitter=0.4, + auto_augment='rand-m9-mstd0.5-inc1', + interpolation='bicubic', + re_prob=0.25, + re_mode='pixel', + re_count=1, + mean=self.mean, + std=self.std) + self.patch_resize_transform = transforms.Compose( + functools.reduce(lambda x, y: x + y, [ + [ + lambda image: image.convert('RGB'), + ], + self.patch_resize_transform.transforms[:2], + [self.patch_resize_transform.transforms[2]], + [ + RandomAugment( + 2, + 7, + isPIL=True, + augs=[ + 'Identity', 'AutoContrast', 'Equalize', + 'Brightness', 'Sharpness', 'ShearX', 'ShearY', + 'TranslateX', 'TranslateY', 'Rotate' + ]), + ], + self.patch_resize_transform.transforms[3:], + ])) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = ' {}'.format(data[self.column_map['text']]) + sample['ref_dict'] = 
{data[self.column_map['text']]: 1.0} + sample['target'] = self.tokenize_text(target, add_bos=False) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, sample['target']]) + + if self.constraint_trie is not None: + constraint_mask = torch.zeros((len(sample['prev_output_tokens']), + len(self.tgt_dict))).bool() + for i in range(len(sample['prev_output_tokens'])): + constraint_prefix_token = sample[ + 'prev_output_tokens'][:i + 1].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') inputs = self.tokenize_text(prompt) @@ -48,4 +114,6 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } + if 'text' in self.column_map and self.column_map['text'] in data: + sample['label'] = data[self.column_map['text']] return sample diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index a0342c14..aa527325 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -11,9 +11,6 @@ from zhconv import convert from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor -IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) -IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) - def ocr_resize(img, patch_image_size, is_document=False): img = img.convert('RGB') @@ -73,13 +70,6 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): """ super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) - # Initialize transform - if self.cfg.model.imagenet_default_mean_and_std: - mean = IMAGENET_DEFAULT_MEAN - std = IMAGENET_DEFAULT_STD - else: - mean = [0.5, 0.5, 0.5] - std = [0.5, 0.5, 0.5] self.patch_resize_transform = transforms.Compose([ lambda image: ocr_resize( @@ -87,7 +77,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): self.cfg.model.patch_image_size, is_document=self.cfg.model.is_document), transforms.ToTensor(), - transforms.Normalize(mean=mean, std=std), + transforms.Normalize(mean=self.mean, std=self.std), ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: From 9c04fec99cd038489b35ce002b2a26a2b3e3619f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Tue, 1 Nov 2022 14:28:57 +0800 Subject: [PATCH 37/54] add task preprocess --- .../preprocessors/ofa/image_captioning.py | 2 +- .../preprocessors/ofa/image_classification.py | 6 +- .../preprocessors/ofa/ocr_recognition.py | 4 +- modelscope/preprocessors/ofa/summarization.py | 36 ++++++- .../preprocessors/ofa/visual_entailment.py | 50 +++++++++- .../preprocessors/ofa/visual_grounding.py | 96 ++++++++++++++++--- .../ofa/visual_question_answering.py | 68 ++++++++++++- 7 files changed, 237 insertions(+), 25 deletions(-) diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index af623297..5fb83908 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -43,7 +43,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: 
Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data[self.column_map['text']] + target = sample['label'] target = target.translate(self.transtab).strip() target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index ffac9070..038a9e15 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -85,11 +85,11 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = ' {}'.format(data[self.column_map['text']]) - sample['ref_dict'] = {data[self.column_map['text']]: 1.0} + target = ' {}'.format(sample['label']) + sample['ref_dict'] = {sample['label']: 1.0} sample['target'] = self.tokenize_text(target, add_bos=False) sample['prev_output_tokens'] = torch.cat( - [self.bos_item, sample['target']]) + [self.bos_item, sample['target'][:-1]]) if self.constraint_trie is not None: constraint_mask = torch.zeros((len(sample['prev_output_tokens']), diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index aa527325..95dab492 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -109,6 +109,6 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): } if 'text' in self.column_map and self.column_map['text'] in data: target = data[self.column_map['text']] - target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) - sample['label'] = target + sample['label'] = unicodedata2.normalize( + 'NFKC', convert(target, 'zh-hans')) return sample diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index cfd3c23d..176600a9 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict +import torch + from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -24,9 +26,27 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target_str = sample['label'].lower() + target = super().pre_caption(target_str, max_words=self.max_tgt_length) + target = target.replace('[unk]', 'unk').replace('', 'unk') + sample['target'] = self.tokenize_text(target, add_bos=False) + noise_target_item = self.add_noise_to_tgt( + sample['target'][:-1].clone()) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, noise_target_item]) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: source = super().pre_caption( - data['text'], max_words=self.max_src_length) - source = source.strip()[:self.max_src_length] + data[self.column_map['text']], max_words=self.max_src_length) + # source = source.strip()[:self.max_src_length] source = source.replace('[unk]', 'unk').replace('', 'unk') prompt = self.cfg.model.get( 'prompt', ' " {} " Summarize the article with a title: ') @@ -42,4 +62,16 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): 'source': inputs, 'decoder_prompt': decoder_prompt, } + if 'summary' in self.column_map and self.column_map['summary'] in data: + sample['label'] = data[self.column_map['summary']] return sample + + def add_noise_to_tgt(self, target): + noise_indices = torch.FloatTensor( + target.size(0)).uniform_() < self.cfg.model.get( + 'noise_ratio', 0.0) + target[noise_indices] = torch.randint( + 4, + len(self.src_dict) - self.code_dict_size - self.num_bins, + size=(noise_indices.sum(), )) + return target diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index 61c3cc6a..aeba199c 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -38,8 +38,51 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = ' {}'.format(sample['label']) + sample['ref_dict'] = {sample['label']: 1.0} + tgt_item = self.tokenize_text(target, add_bos=False, add_eos=False) + + if self.prompt_type == 'none': + prev_output_item = torch.cat([self.bos_item, tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'src': + prev_output_item = torch.cat([sample['source'], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'prev_output': + prev_output_item = torch.cat([sample['source'][:-1], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + else: + raise NotImplementedError + + target_item[:-len(tgt_item) - 1] = self.tgt_dict.pad() + sample['target'] = target_item + sample['prev_output_tokens'] 
= prev_output_item + + if self.constraint_trie is not None: + constraint_mask = torch.zeros( + (len(target_item), len(self.tgt_dict))).bool() + start_idx = len(target_item) - len(tgt_item) - 1 + for i in range( + len(target_item) - len(tgt_item) - 1, len(target_item)): + constraint_prefix_token = [ + self.tgt_dict.bos() + ] + target_item[start_idx:i].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) if 'text2' not in data: hypothesis = self.pre_caption(data['text'], self.max_src_length) @@ -68,4 +111,7 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]), 'decoder_prompt': decoder_prompt, } + if 'relation' in self.column_map and self.column_map[ + 'relation'] in data: + sample['label'] = data[self.column_map['relation']] return sample diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index 8b116463..c36517c1 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict +import numpy as np import torch from PIL import Image from torchvision import transforms @@ -27,24 +28,95 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): """ super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) - # Initialize transform - self.patch_resize_transform = transforms.Compose([ - lambda image: image.convert('RGB'), - transforms.Resize( - (self.patch_image_size, self.patch_image_size), - interpolation=transforms.InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=self.mean, std=self.std), - ]) + + if self.mode == ModeKeys.TRAIN: + # for positioning + self.positioning_transform = transforms.Compose([ + transforms.RandomResize([self.patch_image_size], + max_size=self.patch_image_size), + transforms.ToTensor(), + transforms.Normalize( + mean=self.mean, + std=self.std, + max_image_size=self.max_image_size) + ]) + else: + # Initialize transform + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=self.mean, std=self.std), + ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) + w, h = image.size + b_tgt = { + 'boxes': [], + 'labels': [], + 'area': [], + 'size': torch.tensor([h, w]) + } + x0, y0, x1, y1 = data[self.column_map['region_coord']].strip().split( + ',') + region = torch.tensor([float(x0), float(y0), float(x1), float(y1)]) + b_tgt['boxes'] = torch.tensor( + [[float(x0), float(y0), float(x1), + float(y1)]]) + b_tgt['labels'] = np.array([0]) + b_tgt['area'] = [(float(x1) - 
float(x0)) * (float(y1) - float(y0))] + + patch_image, patch_boxes = self.positioning_transform(image, b_tgt) + resize_h, resize_w = patch_boxes['size'][0], patch_boxes['size'][1] + quant_x0 = ''.format( + int((patch_boxes['boxes'][0][0] * (self.num_bins - 1)).round())) + quant_y0 = ''.format( + int((patch_boxes['boxes'][0][1] * (self.num_bins - 1)).round())) + quant_x1 = ''.format( + int((patch_boxes['boxes'][0][2] * (self.num_bins - 1)).round())) + quant_y1 = ''.format( + int((patch_boxes['boxes'][0][3] * (self.num_bins - 1)).round())) + region_coord = '{} {} {} {}'.format(quant_x0, quant_y0, quant_x1, + quant_y1) + src_caption = self.pre_caption(data[self.column_map['text']], + self.max_src_length) + prompt = self.cfg.model.get( + 'prompt', ' which region does the text " {} " describe?') + text = prompt.format(src_caption) + src_item = self.tokenize_text(text) + target_item = self.tokenize_text( + region_coord, add_bos=False) # !!! use_bpe=False + prev_output_item = torch.cat([self.bos_item, target_item[:-1]]) + + sample = { + 'source': src_item, + 'patch_image': patch_image, + 'patch_mask': torch.tensor([True]), + 'target': target_item, + 'prev_output_tokens': prev_output_item, + 'w_resize_ratio': resize_w / w, + 'h_resize_ratio': resize_h / h, + 'region_coord': region + } + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) w, h = image.size patch_image = self.patch_resize_transform(image) w_resize_ratio = torch.tensor(self.patch_image_size / w) h_resize_ratio = torch.tensor(self.patch_image_size / h) - src_caption = self.pre_caption(data['text'], self.max_src_length) + src_caption = self.pre_caption(data[self.column_map['text']], + self.max_src_length) prompt = self.cfg.model.get( 'prompt', ' which region does the text " {} " describe?') text = prompt.format(src_caption) diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index 11104e7e..9f9ea4f7 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -38,10 +38,70 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + src_item = sample['source'] + ref = data[self.column_map['ref']] + predict_objects = data[self.column_map['predict_objects']] + + ref_dict = { + item.split('|!+')[1]: float(item.split('|!+')[0]) + for item in ref.split('&&') + } + answer = max(ref_dict, key=ref_dict.get) + sample['conf'] = torch.tensor([ref_dict[answer]]) + tgt_item = self.tokenize_text( + ' {}'.format(answer), add_bos=False, add_eos=False) + + if self.add_object and predict_objects is not None: + predict_object_seq = ' '.join( + predict_objects.strip().split('&&')[:self.max_object_length]) + predict_object_item = self.tokenize_text( + ' object: {}'.format(predict_object_seq), add_bos=False) + src_item = torch.cat([src_item, predict_object_item[:-1]]) + + if self.prompt_type == 'none': + prev_output_item = torch.cat([self.bos_item, tgt_item]) + target_item = 
torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'src': + prev_output_item = torch.cat([src_item, tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'prev_output': + prev_output_item = torch.cat([src_item[:-1], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + else: + raise NotImplementedError + target_item[:-len(tgt_item) - 1] = self.tgt_dict.pad() + + sample['prev_output_tokens'] = prev_output_item + sample['target'] = target_item + sample['ref_dict'] = ref_dict + + if self.constraint_trie is not None: + constraint_mask = torch.zeros( + (len(target_item), len(self.tgt_dict))).bool() + start_idx = len(target_item) - len(tgt_item) - 1 + for i in range( + len(target_item) - len(tgt_item) - 1, len(target_item)): + constraint_prefix_token = [ + self.tgt_dict.bos() + ] + target_item[start_idx:i].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) - text = ' {}'.format(data['text']) + text = ' {}'.format(data[self.column_map['text']]) inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item @@ -57,4 +117,6 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]), 'decoder_prompt': decoder_prompt, } + if 'answer' in self.column_map and self.column_map['answer'] in data: + sample['label'] = data[self.column_map['answer']] return sample From b889e64067a079c5f639ca01822eba73e9ab48bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Tue, 1 Nov 2022 14:44:37 +0800 Subject: [PATCH 38/54] add task preprocess --- modelscope/preprocessors/ofa/visual_grounding.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index c36517c1..d9779fbe 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -60,7 +60,7 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: image = self.get_img_pil(data[self.column_map['image']]) w, h = image.size - b_tgt = { + boxes_target = { 'boxes': [], 'labels': [], 'area': [], @@ -69,13 +69,15 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): x0, y0, x1, y1 = data[self.column_map['region_coord']].strip().split( ',') region = torch.tensor([float(x0), float(y0), float(x1), float(y1)]) - b_tgt['boxes'] = torch.tensor( + boxes_target['boxes'] = torch.tensor( [[float(x0), float(y0), float(x1), float(y1)]]) - b_tgt['labels'] = np.array([0]) - b_tgt['area'] = [(float(x1) - float(x0)) * (float(y1) - float(y0))] + boxes_target['labels'] = np.array([0]) + area = [(float(x1) - float(x0)) * (float(y1) - float(y0))] + boxes_target['area'] = torch.tensor(area) - patch_image, patch_boxes = self.positioning_transform(image, b_tgt) + patch_image, patch_boxes = self.positioning_transform( + image, boxes_target) resize_h, resize_w = patch_boxes['size'][0], patch_boxes['size'][1] quant_x0 = ''.format( int((patch_boxes['boxes'][0][0] * (self.num_bins - 1)).round())) From 
ca946067e643dac1fd9920eb7bfd53c5cafbe320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 2 Nov 2022 16:15:32 +0800 Subject: [PATCH 39/54] fix finetune-task --- modelscope/preprocessors/ofa/summarization.py | 3 +- .../preprocessors/ofa/visual_entailment.py | 11 +++++--- .../ofa/visual_question_answering.py | 28 ++++--------------- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index 176600a9..8568a543 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -72,6 +72,7 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): 'noise_ratio', 0.0) target[noise_indices] = torch.randint( 4, - len(self.src_dict) - self.code_dict_size - self.num_bins, + len(self.src_dict) - self.cfg.model.get('num_codes', 8192) + - self.cfg.model.get('num_bins', 1000), size=(noise_indices.sum(), )) return target diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index aeba199c..fff5bbd3 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -61,7 +61,7 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): else: raise NotImplementedError - target_item[:-len(tgt_item) - 1] = self.tgt_dict.pad() + target_item[:-len(tgt_item) - 1] = self.tokenizer.pad_token_id sample['target'] = target_item sample['prev_output_tokens'] = prev_output_item @@ -85,14 +85,17 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) if 'text2' not in data: - hypothesis = self.pre_caption(data['text'], self.max_src_length) + hypothesis = self.pre_caption(data[self.column_map['text']], + self.max_src_length) prompt = self.cfg.model.get('prompt', ' does the image describe " {} "?') text = prompt.format(hypothesis) else: assert 'text' in data, f'text must be in the input {data.keys()}' - caption = self.pre_caption(data['text2'], self.max_src_length) - hypothesis = self.pre_caption(data['text'], self.max_src_length) + caption = self.pre_caption(data[self.column_map['text2']], + self.max_src_length) + hypothesis = self.pre_caption(data[self.column_map['text']], + self.max_src_length) prompt = self.cfg.model.get( 'prompt', ' can image and text1 " {} " imply text2 " {} "?') text = prompt.format(caption, hypothesis) diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index 9f9ea4f7..c623a869 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -45,42 +45,24 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - src_item = sample['source'] - ref = data[self.column_map['ref']] - predict_objects = data[self.column_map['predict_objects']] - - ref_dict = { - item.split('|!+')[1]: float(item.split('|!+')[0]) - for item in ref.split('&&') - } - answer = max(ref_dict, key=ref_dict.get) - sample['conf'] = torch.tensor([ref_dict[answer]]) tgt_item = self.tokenize_text( - ' {}'.format(answer), add_bos=False, add_eos=False) - - if self.add_object and predict_objects is not None: - predict_object_seq = ' '.join( - 
predict_objects.strip().split('&&')[:self.max_object_length]) - predict_object_item = self.tokenize_text( - ' object: {}'.format(predict_object_seq), add_bos=False) - src_item = torch.cat([src_item, predict_object_item[:-1]]) + ' {}'.format(sample['label']), add_bos=False, add_eos=False) if self.prompt_type == 'none': prev_output_item = torch.cat([self.bos_item, tgt_item]) target_item = torch.cat([prev_output_item[1:], self.eos_item]) elif self.prompt_type == 'src': - prev_output_item = torch.cat([src_item, tgt_item]) + prev_output_item = torch.cat([sample['source'], tgt_item]) target_item = torch.cat([prev_output_item[1:], self.eos_item]) elif self.prompt_type == 'prev_output': - prev_output_item = torch.cat([src_item[:-1], tgt_item]) + prev_output_item = torch.cat([sample['source'][:-1], tgt_item]) target_item = torch.cat([prev_output_item[1:], self.eos_item]) else: raise NotImplementedError - target_item[:-len(tgt_item) - 1] = self.tgt_dict.pad() + target_item[:-len(tgt_item) - 1] = self.tokenizer.pad_token_id sample['prev_output_tokens'] = prev_output_item sample['target'] = target_item - sample['ref_dict'] = ref_dict if self.constraint_trie is not None: constraint_mask = torch.zeros( @@ -101,7 +83,7 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) - text = ' {}'.format(data[self.column_map['text']]) + text = ' {}'.format(data[self.column_map['query']]) inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item From 85f4d6b326b460a7164c3b88e26450c42c3a9829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Wed, 2 Nov 2022 17:12:41 +0800 Subject: [PATCH 40/54] fix finetune-task --- modelscope/preprocessors/ofa/visual_question_answering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index c623a869..b83cf935 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -83,7 +83,7 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) - text = ' {}'.format(data[self.column_map['query']]) + text = ' {}'.format(data[self.column_map['text']]) inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item From 84f6de09ea705706d535a7b509b9d0d20e912a07 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Wed, 2 Nov 2022 19:05:02 +0800 Subject: [PATCH 41/54] feat: add argument for changing model output dimension --- modelscope/trainers/audio/kws_farfield_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py index a720ced5..85c1a496 100644 --- a/modelscope/trainers/audio/kws_farfield_trainer.py +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -69,11 +69,14 @@ class KWSFarfieldTrainer(BaseTrainer): super().__init__(cfg_file, arg_parse_fn) - self.model = self.build_model() - self.work_dir = work_dir # the number of model output dimension # should update config outside the trainer, if 
user need more wake word + num_syn = kwargs.get('num_syn', None) + if num_syn: + self.cfg.model.num_syn = num_syn self._num_classes = self.cfg.model.num_syn + self.model = self.build_model() + self.work_dir = work_dir if kwargs.get('launcher', None) is not None: init_dist(kwargs['launcher']) From 3f75fcdb79804f1553846b66b1749ee1a542217e Mon Sep 17 00:00:00 2001 From: yzhao Date: Wed, 2 Nov 2022 20:02:18 +0800 Subject: [PATCH 42/54] fix bug --- .../nlp/text_classification_pipeline.py | 18 ++++++++++-------- modelscope/preprocessors/base.py | 4 +++- modelscope/utils/regress_test_utils.py | 18 ++++++++++++++++++ tests/pipelines/test_fill_mask.py | 10 +++++++--- tests/pipelines/test_nli.py | 7 ++++--- tests/pipelines/test_sentence_similarity.py | 6 ++++-- tests/pipelines/test_word_segmentation.py | 10 +++++++--- .../pipelines/test_zero_shot_classification.py | 6 ++++-- 8 files changed, 57 insertions(+), 22 deletions(-) diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 9e00ad7f..771660a5 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -3,14 +3,13 @@ from typing import Any, Dict, Union import numpy as np -from modelscope.metainfo import Pipelines +from modelscope.metainfo import Pipelines, Preprocessors from modelscope.models.base import Model -from modelscope.models.multi_modal import OfaForAllTasks from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Fields, Tasks @PIPELINES.register_module( @@ -58,8 +57,11 @@ class TextClassificationPipeline(Pipeline): str) else model if preprocessor is None: - if isinstance(model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=model.model_dir) + if model.__class__.__name__ == 'OfaForAllTasks': + preprocessor = Preprocessor.from_pretrained( + model_name_or_path=model.model_dir, + type=Preprocessors.ofa_tasks_preprocessor, + field=Fields.multi_modal) else: first_sequence = kwargs.pop('first_sequence', 'first_sequence') second_sequence = kwargs.pop('second_sequence', None) @@ -76,7 +78,7 @@ class TextClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - if isinstance(self.model, OfaForAllTasks): + if self.model.__class__.__name__ == 'OfaForAllTasks': return super().forward(inputs, **forward_params) return self.model(**inputs, **forward_params) @@ -95,7 +97,7 @@ class TextClassificationPipeline(Pipeline): labels: The real labels. Label at index 0 is the smallest probability. 
""" - if isinstance(self.model, OfaForAllTasks): + if self.model.__class__.__name__ == 'OfaForAllTasks': return inputs else: assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index be62ebb4..eb790f84 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -205,9 +205,11 @@ class Preprocessor(ABC): if 'task' in kwargs: task = kwargs.pop('task') field_name = Tasks.find_field_by_task(task) + if 'field' in kwargs: + field_name = kwargs.pop('field') sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val' - if not hasattr(cfg, 'preprocessor'): + if not hasattr(cfg, 'preprocessor') or len(cfg.preprocessor) == 0: logger.error('No preprocessor field found in cfg.') preprocessor_cfg = ConfigDict() else: diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index be983c6c..58b5b1a3 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -5,6 +5,7 @@ import hashlib import os import pickle import random +import re import shutil import tempfile from collections import OrderedDict @@ -759,3 +760,20 @@ def compare_cfg_and_optimizers(baseline_json, state2, **kwargs) and match return match + + +class IgnoreKeyFn: + + def __init__(self, keys): + if isinstance(keys, str): + keys = [keys] + self.keys = keys if isinstance(keys, list) else [] + + def __call__(self, v1output, v2output, key, type): + if key == 'encoder.encoder.layer.0.intermediate.intermediate_act_fn': + print() + for _key in self.keys: + pattern = re.compile(_key) + if key is not None and pattern.fullmatch(key): + return True + return None diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 35202b88..64833026 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -11,7 +11,7 @@ from modelscope.pipelines.nlp import FillMaskPipeline from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -109,7 +109,9 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, f'fill_mask_sbert_{language}'): + pipeline_ins.model, + f'fill_mask_sbert_{language}', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print( f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') @@ -124,7 +126,9 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): ori_text = self.ori_texts[language] test_input = self.test_inputs[language].replace('[MASK]', '') with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, f'fill_mask_veco_{language}'): + pipeline_ins.model, + f'fill_mask_veco_{language}', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print( f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 5f2dcb25..9e9fefea 100644 --- a/tests/pipelines/test_nli.py 
+++ b/tests/pipelines/test_nli.py @@ -3,13 +3,12 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -48,7 +47,9 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_model_name(self): pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_nli'): + pipeline_ins.model, + 'sbert_nli', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=(self.sentence1, self.sentence2))) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 76db0a8f..904caea3 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -54,7 +54,9 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_sen_sim'): + pipeline_ins.model, + 'sbert_sen_sim', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=(self.sentence1, self.sentence2))) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index cd01b98f..6969c0e6 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import WordSegmentationPipeline from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -48,10 +48,14 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.word_segmentation, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_ws_zh'): + pipeline_ins.model, + 'sbert_ws_zh', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=self.sentence)) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_ws_en'): + pipeline_ins.model, + 
'sbert_ws_en', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=self.sentence_eng)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py index 6a98132a..00789707 100644 --- a/tests/pipelines/test_zero_shot_classification.py +++ b/tests/pipelines/test_zero_shot_classification.py @@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import ZeroShotClassificationPipeline from modelscope.preprocessors import ZeroShotClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -65,7 +65,9 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.zero_shot_classification, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_zero_shot'): + pipeline_ins.model, + 'sbert_zero_shot', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print( pipeline_ins( input=self.sentence, candidate_labels=self.labels)) From 89a95f7f76afb594970dbe13bbf6cc00343966ec Mon Sep 17 00:00:00 2001 From: yzhao Date: Thu, 3 Nov 2022 10:21:38 +0800 Subject: [PATCH 43/54] change error log to warn log --- modelscope/preprocessors/base.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index eb790f84..38500561 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -210,7 +210,7 @@ class Preprocessor(ABC): sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val' if not hasattr(cfg, 'preprocessor') or len(cfg.preprocessor) == 0: - logger.error('No preprocessor field found in cfg.') + logger.warn('No preprocessor field found in cfg.') preprocessor_cfg = ConfigDict() else: preprocessor_cfg = cfg.preprocessor @@ -219,9 +219,8 @@ class Preprocessor(ABC): if sub_key in preprocessor_cfg: sub_cfg = getattr(preprocessor_cfg, sub_key) else: - logger.error( - f'No {sub_key} key and type key found in ' - f'preprocessor domain of configuration.json file.') + logger.warn(f'No {sub_key} key and type key found in ' + f'preprocessor domain of configuration.json file.') sub_cfg = preprocessor_cfg else: sub_cfg = preprocessor_cfg @@ -237,7 +236,7 @@ class Preprocessor(ABC): preprocessor = build_preprocessor(sub_cfg, field_name) else: - logger.error( + logger.warn( f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, ' f'current config: {sub_cfg}. trying to build by task and model information.' ) @@ -245,13 +244,13 @@ class Preprocessor(ABC): model_type = model_cfg.type if hasattr( model_cfg, 'type') else getattr(model_cfg, 'model_type', None) if task is None or model_type is None: - logger.error( + logger.warn( f'Find task: {task}, model type: {model_type}. 
' f'Insufficient information to build preprocessor, skip building preprocessor' ) return None if (model_type, task) not in PREPROCESSOR_MAP: - logger.error( + logger.warn( f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, ' f'skip building preprocessor.') return None From 87fcd28c4c81c91b587a5a92caaf01fd89034b97 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Fri, 4 Nov 2022 09:11:51 +0800 Subject: [PATCH 44/54] add ci tag to cicases --- .dev_scripts/dockerci.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index c502175b..07ea947a 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -37,6 +37,7 @@ do -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_ENVIRONMENT='ci' \ -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ -e MODEL_TAG_URL=$MODEL_TAG_URL \ --workdir=$CODE_DIR_IN_CONTAINER \ @@ -59,6 +60,7 @@ do -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_ENVIRONMENT='ci' \ -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ -e MODEL_TAG_URL=$MODEL_TAG_URL \ --workdir=$CODE_DIR_IN_CONTAINER \ From e1dd9964604e7a50e024c72db81c0fec08426671 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Fri, 4 Nov 2022 13:24:40 +0800 Subject: [PATCH 45/54] fix: failed to update sc_config_file concurrently --- modelscope/models/audio/kws/farfield/model.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index d63d1e2a..af1c0a27 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import tempfile from typing import Dict, Optional from modelscope.metainfo import Models @@ -36,12 +37,15 @@ class FSMNSeleNetV2Decorator(TorchModel): else: sc_config_file = os.path.join(model_dir, self.SC_CONFIG) model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + self.tmp_dir = tempfile.TemporaryDirectory() + new_config_file = os.path.join(self.tmp_dir.name, self.SC_CONFIG) + self._sc = None if os.path.exists(model_txt_file): conf_dict = dict(mode=56542, kws_model=model_txt_file) - update_conf(sc_config_file, sc_config_file, conf_dict) + update_conf(sc_config_file, new_config_file, conf_dict) import py_sound_connect - self._sc = py_sound_connect.SoundConnect(sc_config_file) + self._sc = py_sound_connect.SoundConnect(new_config_file) self.size_in = self._sc.bytesPerBlockIn() self.size_out = self._sc.bytesPerBlockOut() else: @@ -49,6 +53,9 @@ class FSMNSeleNetV2Decorator(TorchModel): f'Invalid model directory! Failed to load model file: {model_txt_file}.' ) + def __del__(self): + self.tmp_dir.cleanup() + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: return self.model.forward(input) From 26db21d57db6cb1b942c568dd36d4fa054ffb81d Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Fri, 4 Nov 2022 09:33:09 +0000 Subject: [PATCH 46/54] fix output video path when person detect failed. 
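For context, the change below stores the input video path on the pipeline instance during preprocess, so that postprocess can still return a usable output video path when person detection fails instead of leaving the failure branch as a no-op. The snippet that follows is only a minimal sketch of the intended failure-path behaviour, not the actual pipeline code: the plain dict keys and the 'input.mp4' path are stand-ins for the real OutputKeys constants and a real video URL.

# Minimal sketch (assumed names, illustrative only) of the fallback introduced by this patch.
def postprocess_sketch(pred, video_url):
    # 'output_video' stands in for OutputKeys.OUTPUT_VIDEO in the real pipeline.
    res = {'keypoints': [], 'timestamps': [], 'output_video': None}
    if not pred['success']:
        # Previously this branch did nothing; now the input path is echoed back.
        res['output_video'] = video_url
    return res

print(postprocess_sketch({'success': False}, 'input.mp4')['output_video'])  # -> input.mp4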
---
 modelscope/pipelines/cv/body_3d_keypoints_pipeline.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
index 8522ceff..d113fb3c 100644
--- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
@@ -132,8 +132,8 @@ class Body3DKeypointsPipeline(Pipeline):
             device='gpu' if torch.cuda.is_available() else 'cpu')

     def preprocess(self, input: Input) -> Dict[str, Any]:
-        video_url = input
-        video_frames = self.read_video_frames(video_url)
+        self.video_url = input
+        video_frames = self.read_video_frames(self.video_url)
         if 0 == len(video_frames):
             res = {'success': False, 'msg': 'get video frame failed.'}
             return res
@@ -198,7 +198,7 @@ class Body3DKeypointsPipeline(Pipeline):
         }

         if not input['success']:
-            pass
+            res[OutputKeys.OUTPUT_VIDEO] = self.video_url
         else:
             poses = input[KeypointsTypes.POSES_CAMERA]
             pred_3d_pose = poses.data.cpu().numpy()[

From 0418786cbeccaa030753a95b4d7ace92b0220a20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?=
Date: Mon, 7 Nov 2022 20:23:17 +0800
Subject: [PATCH 47/54] add five task finetune

---
 modelscope/metainfo.py | 1 +
 modelscope/metrics/builder.py | 1 +
 modelscope/metrics/map_metric.py | 67 +++++++++++++++++++
 .../preprocessors/ofa/visual_grounding.py | 18 +++--
 .../trainers/multi_modal/ofa/ofa_trainer.py | 17 +++--
 tests/trainers/test_ofa_trainer.py | 11 +--
 6 files changed, 101 insertions(+), 14 deletions(-)
 create mode 100644 modelscope/metrics/map_metric.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 8c9964b8..2df6f2a0 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -402,6 +402,7 @@ class Metrics(object):

     # accuracy
     accuracy = 'accuracy'
+    multi_average_precision = 'mAP'
     audio_noise_metric = 'audio-noise-metric'

     # text gen
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index b9e402c5..e2fe67f8 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -24,6 +24,7 @@ class MetricKeys(object):
     ROUGE_1 = 'rouge-1'
     ROUGE_L = 'rouge-l'
     NED = 'ned' # ocr metric
+    mAP = 'mAP'
     BatchAcc = 'inbatch_t2i_recall_at_1'

diff --git a/modelscope/metrics/map_metric.py b/modelscope/metrics/map_metric.py
new file mode 100644
index 00000000..aac76f22
--- /dev/null
+++ b/modelscope/metrics/map_metric.py
@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict
+
+import numpy as np
+
+from modelscope.metainfo import Metrics
+from modelscope.outputs import OutputKeys
+from modelscope.utils.registry import default_group
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group, module_name=Metrics.multi_average_precision)
+class AveragePrecisionMetric(Metric):
+    """The metric computation class for multi average precision classes.
+
+    This metric class calculates multi average precision for the whole input batches.
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.preds = [] + self.labels = [] + self.thresh = kwargs.get('threshold', 0.5) + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[label_name] + for key in [ + OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, + OutputKeys.LABELS, OutputKeys.SCORES + ]: + if key in outputs and outputs[key] is not None: + eval_results = outputs[key] + break + assert type(ground_truths) == type(eval_results) + for truth in ground_truths: + self.labels.append(truth) + for result in eval_results: + if isinstance(truth, str): + self.preds.append(result.strip().replace(' ', '')) + else: + self.preds.append(result) + + def evaluate(self): + assert len(self.preds) == len(self.labels) + scores = self._calculate_ap_score(self.preds, self.labels, self.thresh) + return {MetricKeys.mAP: scores.mean().item()} + + def _calculate_ap_score(self, preds, labels, thresh=0.5): + hyps = np.array(preds) + refs = np.array(labels) + a = np.where(hyps[:, :2] < refs[:, :2], refs[:, :2], hyps[:, :2]) + b = np.where(hyps[:, 2:] < refs[:, 2:], hyps[:, 2:], refs[:, 2:]) + interacts = np.concatenate([a, b], axis=1) + area_predictions = (hyps[:, 2] - hyps[:, 0]) * ( + hyps[:, 3] - hyps[:, 1]) + area_targets = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1]) + interacts_w = interacts[:, 2] - interacts[:, 0] + interacts_h = interacts[:, 3] - interacts[:, 1] + area_interacts = interacts_w * interacts_h + ious = area_interacts / ( + area_predictions + area_targets - area_interacts + 1e-6) + return (ious >= thresh) & (interacts_w > 0) & (interacts_h > 0) diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index d9779fbe..2da79670 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -9,6 +9,7 @@ from torchvision import transforms from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor +from .utils import transforms as T class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): @@ -29,13 +30,14 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) + self.num_bins = self.cfg.model.get('num_bins', 1000) if self.mode == ModeKeys.TRAIN: # for positioning - self.positioning_transform = transforms.Compose([ - transforms.RandomResize([self.patch_image_size], - max_size=self.patch_image_size), - transforms.ToTensor(), - transforms.Normalize( + self.positioning_transform = T.Compose([ + T.RandomResize([self.patch_image_size], + max_size=self.patch_image_size), + T.ToTensor(), + T.Normalize( mean=self.mean, std=self.std, max_image_size=self.max_image_size) @@ -130,4 +132,10 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): 'w_resize_ratio': w_resize_ratio, 'h_resize_ratio': h_resize_ratio, } + + if 'region_coord' in self.column_map and self.column_map[ + 'region_coord'] in data: + x0, y0, x1, y1 = data[ + self.column_map['region_coord']].strip().split(',') + sample['label'] = [float(x0), float(y0), float(x1), float(y1)] return sample diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index f8028c6c..71494768 100644 --- 
a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -34,6 +34,7 @@ class OFATrainer(EpochBasedTrainer): self, model: Optional[Union[TorchModel, nn.Module, str]] = None, cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, arg_parse_fn: Optional[Callable] = None, data_collator: Optional[Union[Callable, Dict[str, Callable]]] = None, @@ -49,7 +50,8 @@ class OFATrainer(EpochBasedTrainer): **kwargs): model = Model.from_pretrained(model, revision=model_revision) model_dir = model.model_dir - cfg = Config.from_file(cfg_file) + self.cfg_modify_fn = cfg_modify_fn + cfg = self.rebuild_config(Config.from_file(cfg_file)) if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: work_dir = cfg.train.work_dir else: @@ -57,10 +59,12 @@ class OFATrainer(EpochBasedTrainer): tokenizer_files = { 'zh': [ 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt', - 'config.json' + 'config.json', 'ans2label.json' + ], + 'en': [ + 'tokenizer.json', 'vocab.json', 'merges.txt', 'config.json', + 'ans2label.json' ], - 'en': - ['tokenizer.json', 'vocab.json', 'merges.txt', 'config.json'], } for filename in tokenizer_files[cfg.model.get('language', 'en')]: finetune_file = os.path.join(work_dir, filename) @@ -127,6 +131,11 @@ class OFATrainer(EpochBasedTrainer): **kwargs, ) + def rebuild_config(self, cfg: Config): + if self.cfg_modify_fn is not None: + cfg = self.cfg_modify_fn(cfg) + return cfg + def train_step(self, model, inputs): model.train() loss, sample_size, logging_output = self.criterion(model, inputs) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 85c21881..098416bb 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -5,10 +5,10 @@ import unittest import json -from modelscope.metainfo import Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import DownloadMode, ModelFile +from modelscope.utils.hub import read_config from modelscope.utils.test_utils import test_level @@ -73,11 +73,12 @@ class TestOfaTrainer(unittest.TestCase): def test_trainer_std(self): WORKSPACE = './workspace/ckpts/recognition' os.makedirs(WORKSPACE, exist_ok=True) - config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) - with open(config_file, 'w') as writer: - json.dump(self.finetune_cfg, writer) pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh' + cfg = read_config(pretrained_model) + config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) + cfg.dump(config_file) + args = dict( model=pretrained_model, work_dir=WORKSPACE, @@ -94,7 +95,7 @@ class TestOfaTrainer(unittest.TestCase): split='test[:20]', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), cfg_file=config_file) - trainer = build_trainer(name=Trainers.ofa, default_args=args) + trainer = build_trainer(name='ofa', default_args=args) trainer.train() self.assertIn( From a02b2409d8f15bcce399b6def7ea0e23a724f51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Tue, 8 Nov 2022 10:39:45 +0800 Subject: [PATCH 48/54] add five finetune task & merge master --- modelscope/preprocessors/ofa/summarization.py | 1 - tests/trainers/test_ofa_trainer.py | 13 ++++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index 8568a543..d33e9d25 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ 
b/modelscope/preprocessors/ofa/summarization.py @@ -46,7 +46,6 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: source = super().pre_caption( data[self.column_map['text']], max_words=self.max_src_length) - # source = source.strip()[:self.max_src_length] source = source.replace('[unk]', 'unk').replace('', 'unk') prompt = self.cfg.model.get( 'prompt', ' " {} " Summarize the article with a title: ') diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 098416bb..f72400eb 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -71,13 +71,20 @@ class TestOfaTrainer(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer_std(self): + # WORKSPACE = './workspace/ckpts/recognition' + # os.makedirs(WORKSPACE, exist_ok=True) + # + # pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh' + # cfg = read_config(pretrained_model) + # config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) + # cfg.dump(config_file) WORKSPACE = './workspace/ckpts/recognition' os.makedirs(WORKSPACE, exist_ok=True) + config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) + with open(config_file, 'w') as writer: + json.dump(self.finetune_cfg, writer) pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh' - cfg = read_config(pretrained_model) - config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) - cfg.dump(config_file) args = dict( model=pretrained_model, From 353497070919286d647fc06a58ab115b2b1ebeef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Tue, 8 Nov 2022 10:39:59 +0800 Subject: [PATCH 49/54] add five finetune task & merge master --- tests/trainers/test_ofa_trainer.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index f72400eb..a678865a 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -71,13 +71,6 @@ class TestOfaTrainer(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer_std(self): - # WORKSPACE = './workspace/ckpts/recognition' - # os.makedirs(WORKSPACE, exist_ok=True) - # - # pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh' - # cfg = read_config(pretrained_model) - # config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) - # cfg.dump(config_file) WORKSPACE = './workspace/ckpts/recognition' os.makedirs(WORKSPACE, exist_ok=True) config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) From fd4276ad1a50dc534e6d95105fa5efcf50d9340e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=8E=E8=88=AA?= Date: Tue, 8 Nov 2022 10:57:22 +0800 Subject: [PATCH 50/54] add five finetune task & merge master --- tests/trainers/test_ofa_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index a678865a..0516e569 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -5,6 +5,7 @@ import unittest import json +from modelscope.metainfo import Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import DownloadMode, ModelFile @@ -95,7 +96,7 @@ class TestOfaTrainer(unittest.TestCase): split='test[:20]', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), 
cfg_file=config_file) - trainer = build_trainer(name='ofa', default_args=args) + trainer = build_trainer(name=Trainers.ofa, default_args=args) trainer.train() self.assertIn( From dc1b88b3964bea6116b41eeb9634cd38a8cac362 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 8 Nov 2022 14:05:01 +0800 Subject: [PATCH 51/54] [to #42322933] Fix bug for distributed gpt3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复 Pipeline 更新后 DistributedGPT3Pipeline 出现的报错 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10614071 --- modelscope/pipelines/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 68010012..7a8bfd14 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -366,6 +366,7 @@ class DistributedPipeline(Pipeline): master_port=master_port, **self.cfg.model, **kwargs), ranks) + self.models = [] def __del__(self): if hasattr(self, 'model_pool') and self.model_pool is not None: From d3519bcbca98c0fdf290966ff29d08e6d3698900 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 8 Nov 2022 15:42:08 +0800 Subject: [PATCH 52/54] [to #42322933]token preprocess bug fix Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10608664 --- .../nlp/token_classification_preprocessor.py | 18 ++++++++++++------ ...st_multilingual_named_entity_recognition.py | 10 ++++++++++ .../pipelines/test_named_entity_recognition.py | 10 +++++++++- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 92b7c46b..a7616736 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -73,10 +73,12 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): super().__init__(model_dir, mode=mode, **kwargs) if 'is_split_into_words' in kwargs: - self.is_split_into_words = kwargs.pop('is_split_into_words') + self.tokenize_kwargs['is_split_into_words'] = kwargs.pop( + 'is_split_into_words') else: - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) + self.tokenize_kwargs[ + 'is_split_into_words'] = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) if 'label2id' in kwargs: kwargs.pop('label2id') @@ -99,7 +101,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): if isinstance(data, str): # for inference inputs without label text = data - self.tokenize_kwargs['add_special_tokens'] = False elif isinstance(data, dict): # for finetune inputs with label text = data.get(self.first_sequence) @@ -107,11 +108,15 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): if isinstance(text, list): self.tokenize_kwargs['is_split_into_words'] = True + if self._mode == ModeKeys.INFERENCE: + self.tokenize_kwargs['add_special_tokens'] = False + input_ids = [] label_mask = [] offset_mapping = [] token_type_ids = [] - if self.is_split_into_words and self._mode == ModeKeys.INFERENCE: + if self.tokenize_kwargs[ + 'is_split_into_words'] and self._mode == ModeKeys.INFERENCE: for offset, token in enumerate(list(text)): subtoken_ids = self.tokenizer.encode(token, **self.tokenize_kwargs) @@ -125,7 +130,8 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): encodings = self.tokenizer( text, return_offsets_mapping=True, 
**self.tokenize_kwargs) attention_mask = encodings['attention_mask'] - token_type_ids = encodings['token_type_ids'] + if 'token_type_ids' in encodings: + token_type_ids = encodings['token_type_ids'] input_ids = encodings['input_ids'] word_ids = encodings.word_ids() for i in range(len(word_ids)): diff --git a/tests/pipelines/test_multilingual_named_entity_recognition.py b/tests/pipelines/test_multilingual_named_entity_recognition.py index 6f72c83c..cb2b32d6 100644 --- a/tests/pipelines/test_multilingual_named_entity_recognition.py +++ b/tests/pipelines/test_multilingual_named_entity_recognition.py @@ -27,6 +27,9 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, viet_tcrf_model_id = 'damo/nlp_xlmr_named-entity-recognition_viet-ecommerce-title' viet_sentence = 'Nón vành dễ thương cho bé gái' + multilingual_model_id = 'damo/nlp_raner_named-entity-recognition_multilingual-large-generic' + ml_stc = 'সমস্ত বেতন নিলামের সাধারণ ব্যবহারিক উদাহরণ বিভিন্ন পেনি নিলাম / বিডিং ফি নিলাম ওয়েবসাইটে পাওয়া যাবে।' + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download_thai(self): cache_path = snapshot_download(self.thai_tcrf_model_id) @@ -60,6 +63,13 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, task=Tasks.named_entity_recognition, model=self.thai_tcrf_model_id) print(pipeline_ins(input=self.thai_sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_tcrf_with_model_name_multilingual(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.multilingual_model_id) + print(pipeline_ins(input=self.ml_stc)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download_viet(self): cache_path = snapshot_download(self.viet_tcrf_model_id) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index aef4aaed..0df44f5b 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -20,10 +20,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' + chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic' tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' sentence_en = 'pizza shovel' + sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): @@ -91,11 +93,17 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_lcrf_with_chinese_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.chinese_model_id) + print(pipeline_ins(input=self.sentence_zh)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_english_with_model_name(self): pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=self.english_model_id) - 
print(pipeline_ins(input='pizza shovel')) + print(pipeline_ins(input=self.sentence_en)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): From 57499c248c8521a60556c0e025fb5b1fa2034166 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Tue, 8 Nov 2022 16:05:53 +0800 Subject: [PATCH 53/54] [to #42322933] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1da48ef2..3d90c7ef 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Introduction -[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bringing together most advanced machine learning models from the AI community, and to streamlining the process of leveraging and applying AI models . The core ModelScope library enables developers to perform model inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains. +[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bring together most advanced machine learning models from the AI community, and to streamline the process of leveraging AI models in real applications. The core ModelScope library enables developers to perform inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains. -The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modality, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done within only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary. +The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modality, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done with only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary. -Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with the backend services of ModelScope, particularly with the Model-Hub and Dataset-Hub. Such interactions facilitate various entity (models and datasets) management to be performed seamlessly under-the-hood, such as entity lookup, version control, and cache management. +Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with ModelScope backend services, particularly with the Model-Hub and Dataset-Hub. Such interactions facilitate management of various entities (models and datasets) to be performed seamlessly under-the-hood, including entity lookup, version control, cache management, and many others. 
# Installation From 0f0fdcae6fb981bd5e91ab0beada03cdd08854a4 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 8 Nov 2022 17:58:03 +0800 Subject: [PATCH 54/54] [to #42322933] Fix bug for mplug evaluation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复了 mplug evaluation 使用了错误的 metrics 的问题,将部分中文处理代码独立到 utils 中,为 mplug 添加 trainer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10612875 --- modelscope/metainfo.py | 1 + modelscope/metrics/accuracy_metric.py | 7 ++-- modelscope/metrics/builder.py | 4 +- modelscope/metrics/text_generation_metric.py | 17 ++------ .../models/multi_modal/mplug_for_all_tasks.py | 26 +----------- .../models/nlp/task_models/text_generation.py | 5 ++- .../pipelines/nlp/text_generation_pipeline.py | 25 +----------- modelscope/trainers/multi_modal/__init__.py | 6 ++- .../trainers/multi_modal/mplug/__init__.py | 3 ++ .../multi_modal/mplug/mplug_trainer.py | 40 +++++++++++++++++++ modelscope/utils/chinese_utils.py | 35 ++++++++++++++++ tests/trainers/test_finetune_mplug.py | 38 +++++------------- tests/utils/test_ast.py | 2 +- 13 files changed, 111 insertions(+), 98 deletions(-) create mode 100644 modelscope/trainers/multi_modal/mplug/__init__.py create mode 100644 modelscope/trainers/multi_modal/mplug/mplug_trainer.py create mode 100644 modelscope/utils/chinese_utils.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2df6f2a0..c7c3e729 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -299,6 +299,7 @@ class Trainers(object): # multi-modal trainers clip_multi_modal_embedding = 'clip-multi-modal-embedding' ofa = 'ofa' + mplug = 'mplug' # cv trainers image_instance_segmentation = 'image-instance-segmentation' diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index 953ece4c..fe040177 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -6,6 +6,7 @@ import numpy as np from modelscope.metainfo import Metrics from modelscope.outputs import OutputKeys +from modelscope.utils.chinese_utils import remove_space_between_chinese_chars from modelscope.utils.registry import default_group from .base import Metric from .builder import METRICS, MetricKeys @@ -26,10 +27,10 @@ class AccuracyMetric(Metric): def add(self, outputs: Dict, inputs: Dict): label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS ground_truths = inputs[label_name] - eval_results = outputs[label_name] + eval_results = None for key in [ OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, - OutputKeys.LABELS, OutputKeys.SCORES + OutputKeys.LABEL, OutputKeys.LABELS, OutputKeys.SCORES ]: if key in outputs and outputs[key] is not None: eval_results = outputs[key] @@ -39,7 +40,7 @@ class AccuracyMetric(Metric): self.labels.append(truth) for result in eval_results: if isinstance(truth, str): - self.preds.append(result.strip().replace(' ', '')) + self.preds.append(remove_space_between_chinese_chars(result)) else: self.preds.append(result) diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index e2fe67f8..03d4c324 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -41,8 +41,8 @@ task_default_metrics = { Tasks.image_portrait_enhancement: [Metrics.image_portrait_enhancement_metric], Tasks.video_summarization: [Metrics.video_summarization_metric], - Tasks.image_captioning: [Metrics.text_gen_metric], - Tasks.visual_question_answering: [Metrics.text_gen_metric], + 
Tasks.image_captioning: [Metrics.accuracy], + Tasks.visual_question_answering: [Metrics.accuracy], Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], Tasks.image_inpainting: [Metrics.image_inpainting_metric], Tasks.referring_video_object_segmentation: diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index c2d9c6a8..08df5235 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -8,6 +8,7 @@ from rouge import Rouge from modelscope.metainfo import Metrics from modelscope.metrics.base import Metric from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.utils.chinese_utils import rebuild_chinese_str from modelscope.utils.registry import default_group @@ -24,25 +25,13 @@ class TextGenerationMetric(Metric): self.tgts: List[str] = [] self.rouge = Rouge() - @staticmethod - def is_chinese_char(char: str): - # the length of char must be 1 - return '\u4e00' <= char <= '\u9fa5' - - # add space for each chinese char - def rebuild_str(self, string: str): - return ' '.join(''.join([ - f' {char} ' if self.is_chinese_char(char) else char - for char in string - ]).split()) - def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]): ground_truths = inputs['tgts'] eval_results = outputs['preds'] for truth in ground_truths: - self.tgts.append(self.rebuild_str(truth)) + self.tgts.append(rebuild_chinese_str(truth)) for result in eval_results: - self.preds.append(self.rebuild_str(result)) + self.preds.append(rebuild_chinese_str(result)) def _check(self, pred: str, tgt: str) -> bool: diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index 64a7dd7b..7de8d291 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -45,10 +45,6 @@ class MPlugForAllTasks(TorchModel): } """ - replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), - ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), - ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) - # get task from config file task = Config.from_file( osp.join(self.model_dir, ModelFile.CONFIGURATION)).task @@ -60,10 +56,7 @@ class MPlugForAllTasks(TorchModel): return {OutputKeys.SCORES: output[0].tolist()} topk_ids, _ = output pred_string: List[str] = \ - self.tokenizer.decode(topk_ids[0][0]) - for _old, _new in replace_tokens_bert: - pred_string = pred_string.replace(_old, _new) - pred_string = pred_string.strip() + self.tokenizer.decode(topk_ids[0][0], skip_special_tokens=True) output_key = OutputKeys.CAPTION \ if task == Tasks.image_captioning else OutputKeys.TEXT return {output_key: pred_string} @@ -87,19 +80,4 @@ class MPlugForAllTasks(TorchModel): # evaluate topk_ids, _ = output - preds: List[str] = [ - self.tokenizer.decode(batch[0]) for batch in topk_ids - ] - for i in range(len(preds)): - for _old, _new in replace_tokens_bert: - preds[i] = preds[i].replace(_old, _new) - preds[i] = preds[i].strip() - tgts: List[str] = [ - self.tokenizer.decode(batch) - for batch in input['answer_input_ids'].cpu().numpy().tolist() - ] - for i in range(len(tgts)): - for _old, _new in replace_tokens_bert: - tgts[i] = tgts[i].replace(_old, _new) - preds[i] = preds[i].strip() - return {'preds': preds, 'tgts': tgts} + return {'sequences': [list_tensor[0] for list_tensor in topk_ids]} diff --git a/modelscope/models/nlp/task_models/text_generation.py 
b/modelscope/models/nlp/task_models/text_generation.py index cd8e20cf..b886f124 100644 --- a/modelscope/models/nlp/task_models/text_generation.py +++ b/modelscope/models/nlp/task_models/text_generation.py @@ -2,7 +2,7 @@ from typing import Any, Dict import numpy as np -from transformers.modeling_utils import PreTrainedModel +from transformers.modeling_utils import GenerationMixin from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS @@ -17,7 +17,8 @@ __all__ = ['TaskModelForTextGeneration'] @MODELS.register_module( Tasks.text_generation, module_name=TaskModels.text_generation) -class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): +class TaskModelForTextGeneration(SingleBackboneTaskModelBase, GenerationMixin): + main_input_name = 'input_ids' def __init__(self, model_dir: str, *args, **kwargs): """initialize the text generation model from the `model_dir` path. diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index fdde5f25..0490c8e7 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -10,6 +10,7 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.chinese_utils import remove_space_between_chinese_chars from modelscope.utils.constant import Fields, Tasks from modelscope.utils.hub import read_config @@ -78,28 +79,6 @@ class TextGenerationPipeline(Pipeline): with torch.no_grad(): return self.model.generate(inputs, **forward_params) - def _is_chinese_char(self, word: str): - chinese_punctuations = (',', '。', ';', ':' '!', '?', '《', '》') - return len(word) == 1 \ - and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations) - - def _remove_space_between_chinese_chars(self, decoded: str): - old_word_list = decoded.split(' ') - new_word_list = [] - start = -1 - for i, word in enumerate(old_word_list): - if self._is_chinese_char(word): - if start == -1: - start = i - else: - if start != -1: - new_word_list.append(''.join(old_word_list[start:i])) - start = -1 - new_word_list.append(word) - if start != -1: - new_word_list.append(''.join(old_word_list[start:])) - return ' '.join(new_word_list) - def decode(self, inputs) -> str: tokenizer = self.preprocessor.tokenizer return tokenizer.decode(inputs.tolist(), skip_special_tokens=True) @@ -128,5 +107,5 @@ class TextGenerationPipeline(Pipeline): if isinstance(inputs, list) or len(inputs.shape) > 1: inputs = inputs[0] decoded = getattr(self, self.postprocessor)(inputs) - text = self._remove_space_between_chinese_chars(decoded) + text = remove_space_between_chinese_chars(decoded) return {OutputKeys.TEXT: text} diff --git a/modelscope/trainers/multi_modal/__init__.py b/modelscope/trainers/multi_modal/__init__.py index 448f23a3..6840b573 100644 --- a/modelscope/trainers/multi_modal/__init__.py +++ b/modelscope/trainers/multi_modal/__init__.py @@ -6,11 +6,15 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .clip import CLIPTrainer from .team import TEAMImgClsTrainer + from .ofa import OFATrainer + from .mplug import MPlugTrainer else: _import_structure = { 'clip': ['CLIPTrainer'], - 'team': ['TEAMImgClsTrainer'] + 'team': ['TEAMImgClsTrainer'], + 'ofa': ['OFATrainer'], + 'mplug': ['MPlugTrainer'], } import sys diff --git 
a/modelscope/trainers/multi_modal/mplug/__init__.py b/modelscope/trainers/multi_modal/mplug/__init__.py new file mode 100644 index 00000000..caf7e3f0 --- /dev/null +++ b/modelscope/trainers/multi_modal/mplug/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .mplug_trainer import MPlugTrainer diff --git a/modelscope/trainers/multi_modal/mplug/mplug_trainer.py b/modelscope/trainers/multi_modal/mplug/mplug_trainer.py new file mode 100644 index 00000000..def66220 --- /dev/null +++ b/modelscope/trainers/multi_modal/mplug/mplug_trainer.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from collections.abc import Mapping + +import torch + +from modelscope.metainfo import Trainers +from modelscope.outputs import OutputKeys +from modelscope.trainers import NlpEpochBasedTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.file_utils import func_receive_dict_inputs + + +@TRAINERS.register_module(module_name=Trainers.mplug) +class MPlugTrainer(NlpEpochBasedTrainer): + + def _decode(self, tokens): + tokenizer = self.eval_preprocessor.tokenizer + return tokenizer.decode(tokens, skip_special_tokens=True) + + def evaluation_step(self, data): + model = self.model.module if self._dist else self.model + model.eval() + + with torch.no_grad(): + if isinstance( + data, + Mapping) and not func_receive_dict_inputs(model.forward): + result = model.forward(**data) + else: + result = model.forward(data) + + result[OutputKeys.TEXT] = [ + self._decode(seq) for seq in result['sequences'] + ] + data[OutputKeys.LABELS] = [ + self._decode(seq) for seq in data['answer_input_ids'] + ] + + return result diff --git a/modelscope/utils/chinese_utils.py b/modelscope/utils/chinese_utils.py new file mode 100644 index 00000000..e5fe7aa8 --- /dev/null +++ b/modelscope/utils/chinese_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ + +def is_chinese_char(word: str): + chinese_punctuations = { + ',', '。', ';', ':' + '!', '?', '《', '》', '‘', '’', '“', '”', '(', ')', '【', '】' + } + return len(word) == 1 \ + and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations) + + +def remove_space_between_chinese_chars(decoded_str: str): + old_word_list = decoded_str.split(' ') + new_word_list = [] + start = -1 + for i, word in enumerate(old_word_list): + if is_chinese_char(word): + if start == -1: + start = i + else: + if start != -1: + new_word_list.append(''.join(old_word_list[start:i])) + start = -1 + new_word_list.append(word) + if start != -1: + new_word_list.append(''.join(old_word_list[start:])) + return ' '.join(new_word_list).strip() + + +# add space for each chinese char +def rebuild_chinese_str(string: str): + return ' '.join(''.join([ + f' {char} ' if is_chinese_char(char) else char for char in string + ]).split()) diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py index 4972a731..46664114 100644 --- a/tests/trainers/test_finetune_mplug.py +++ b/tests/trainers/test_finetune_mplug.py @@ -20,10 +20,7 @@ class TestFinetuneMPlug(unittest.TestCase): self.tmp_dir = tempfile.TemporaryDirectory().name if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) - from modelscope.utils.constant import DownloadMode - datadict = MsDataset.load( - 'coco_captions_small_slice', - download_mode=DownloadMode.FORCE_REDOWNLOAD) + datadict = MsDataset.load('coco_captions_small_slice') self.train_dataset = MsDataset( datadict['train'].remap_columns({ 'image:FILE': 'image', @@ -40,18 +37,6 @@ class TestFinetuneMPlug(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - def _cfg_modify_fn(self, cfg): - cfg.train.hooks = [{ - 'type': 'CheckpointHook', - 'interval': self.max_epochs - }, { - 'type': 'TextLoggerHook', - 'interval': 1 - }, { - 'type': 'IterTimerHook' - }] - return cfg - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer_with_caption(self): kwargs = dict( @@ -59,11 +44,10 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir, - cfg_modify_fn=self._cfg_modify_fn) + work_dir=self.tmp_dir) trainer: EpochBasedTrainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + name=Trainers.mplug, default_args=kwargs) trainer.train() @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -80,7 +64,7 @@ class TestFinetuneMPlug(unittest.TestCase): work_dir=self.tmp_dir) trainer: EpochBasedTrainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + name=Trainers.mplug, default_args=kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) @@ -94,11 +78,10 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir, - cfg_modify_fn=self._cfg_modify_fn) + work_dir=self.tmp_dir) trainer: EpochBasedTrainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + name=Trainers.mplug, default_args=kwargs) trainer.train() @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -115,7 +98,7 @@ class TestFinetuneMPlug(unittest.TestCase): work_dir=self.tmp_dir) trainer: EpochBasedTrainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + 
name=Trainers.mplug, default_args=kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) @@ -129,11 +112,10 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir, - cfg_modify_fn=self._cfg_modify_fn) + work_dir=self.tmp_dir) trainer: EpochBasedTrainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + name=Trainers.mplug, default_args=kwargs) trainer.train() @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -150,7 +132,7 @@ class TestFinetuneMPlug(unittest.TestCase): work_dir=self.tmp_dir) trainer: EpochBasedTrainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + name=Trainers.mplug, default_args=kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index c0624679..0243053e 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -41,7 +41,7 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) - self.assertEqual(len(from_imports.keys()), 9) + self.assertEqual(len(from_imports.keys()), 10) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) self.assertEqual(decorators,