Browse Source

add basic remap column wrapper

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10539917

    * add basic remap column wrapper
master
yingda.chen 2 years ago
parent
commit
de708dd518
3 changed files with 32 additions and 15 deletions
  1. +12
    -0
      modelscope/msdatasets/ms_dataset.py
  2. +10
    -11
      tests/trainers/test_finetune_mplug.py
  3. +10
    -4
      tests/trainers/test_finetune_text_generation.py

+ 12
- 0
modelscope/msdatasets/ms_dataset.py View File

@@ -563,6 +563,18 @@ class MsDataset:
self._hf_ds.reset_format()
return self._hf_ds

def remap_columns(self, column_mapping: Dict[str, str]) -> Dataset:
    """
    Rename columns of the wrapped hf dataset and hand the result back.

    The underlying hf dataset has its format reset first, then
    ``rename_columns`` is invoked on it with ``column_mapping``; whatever
    that call produces is returned directly (it is not re-wrapped in an
    MsDataset).

    TODO: support native MsDataset column rename.

    Args:
        column_mapping: maps each original column name to its new name.

    Returns:
        The object returned by ``rename_columns`` on the underlying
        hf dataset.
    """
    hf_dataset = self._hf_ds
    # Same preparation step as the sibling accessor: clear any format
    # set on the hf dataset before it is exposed to the caller.
    hf_dataset.reset_format()
    renamed = hf_dataset.rename_columns(column_mapping)
    return renamed

@staticmethod
def upload(object_name: str,
local_file_path: str,


+ 10
- 11
tests/trainers/test_finetune_mplug.py View File

@@ -24,17 +24,16 @@ class TestFinetuneMPlug(unittest.TestCase):
datadict = MsDataset.load(
'coco_captions_small_slice',
download_mode=DownloadMode.FORCE_REDOWNLOAD)
self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
lambda _: {
'question': 'what the picture describes?'
}).rename_column('image:FILE',
'image').rename_column('answer:Value', 'answer'))
self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
lambda _: {
'question': 'what the picture describes?'
}).rename_column('image:FILE',
'image').rename_column('answer:Value', 'answer'))

self.train_dataset = MsDataset(
datadict['train'].remap_columns({
'image:FILE': 'image',
'answer:Value': 'answer'
}).map(lambda _: {'question': 'what the picture describes?'}))
self.test_dataset = MsDataset(
datadict['test'].remap_columns({
'image:FILE': 'image',
'answer:Value': 'answer'
}).map(lambda _: {'question': 'what the picture describes?'}))
self.max_epochs = 2

def tearDown(self):


+ 10
- 4
tests/trainers/test_finetune_text_generation.py View File

@@ -130,10 +130,16 @@ class TestFinetuneTextGeneration(unittest.TestCase):
def test_finetune_cnndm(self):
from modelscope.msdatasets import MsDataset
dataset_dict = MsDataset.load('DuReader_robust-QG')
train_dataset = dataset_dict['train'].to_hf_dataset() \
.rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
eval_dataset = dataset_dict['validation'].to_hf_dataset() \
.rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
train_dataset = dataset_dict['train'].remap_columns({
'text1': 'src_txt',
'text2': 'tgt_txt'
})
eval_dataset = dataset_dict['validation'].remap_columns({
'text1':
'src_txt',
'text2':
'tgt_txt'
})
num_warmup_steps = 200
os.environ['LOCAL_RANK'] = '0'



Loading…
Cancel
Save