Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10539917 * add basic remap column wrapper (target branch: master)
@@ -563,6 +563,18 @@ class MsDataset:
         self._hf_ds.reset_format()
         return self._hf_ds

+    def remap_columns(self, column_mapping: Dict[str, str]) -> Dataset:
+        """
+        Rename columns and return the underlying hf dataset directly.
+        TODO: support native MsDataset column rename.
+        Args:
+            column_mapping: the mapping of the original and new column names
+        Returns:
+            underlying hf dataset
+        """
+        self._hf_ds.reset_format()
+        return self._hf_ds.rename_columns(column_mapping)
+
     @staticmethod
     def upload(object_name: str,
                local_file_path: str,
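For reviewers, this is how the new wrapper reads at a call site. The snippet below simply mirrors the updated text-generation test further down (dataset name and column names are taken from that test, nothing new is assumed); note that remap_columns returns the underlying hf dataset rather than an MsDataset, so any follow-up calls such as map operate on the hf object:

    from modelscope.msdatasets import MsDataset

    # Load a dataset dict of MsDataset splits, then rename columns on one split.
    # remap_columns resets the format and hands back the underlying hf dataset.
    dataset_dict = MsDataset.load('DuReader_robust-QG')
    train_dataset = dataset_dict['train'].remap_columns({
        'text1': 'src_txt',
        'text2': 'tgt_txt'
    })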
@@ -24,17 +24,16 @@ class TestFinetuneMPlug(unittest.TestCase):
         datadict = MsDataset.load(
             'coco_captions_small_slice',
             download_mode=DownloadMode.FORCE_REDOWNLOAD)
-        self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
-            lambda _: {
-                'question': 'what the picture describes?'
-            }).rename_column('image:FILE',
-                             'image').rename_column('answer:Value', 'answer'))
-        self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
-            lambda _: {
-                'question': 'what the picture describes?'
-            }).rename_column('image:FILE',
-                             'image').rename_column('answer:Value', 'answer'))
+        self.train_dataset = MsDataset(
+            datadict['train'].remap_columns({
+                'image:FILE': 'image',
+                'answer:Value': 'answer'
+            }).map(lambda _: {'question': 'what the picture describes?'}))
+        self.test_dataset = MsDataset(
+            datadict['test'].remap_columns({
+                'image:FILE': 'image',
+                'answer:Value': 'answer'
+            }).map(lambda _: {'question': 'what the picture describes?'}))
         self.max_epochs = 2

     def tearDown(self):
@@ -130,10 +130,16 @@ class TestFinetuneTextGeneration(unittest.TestCase):
     def test_finetune_cnndm(self):
         from modelscope.msdatasets import MsDataset
         dataset_dict = MsDataset.load('DuReader_robust-QG')
-        train_dataset = dataset_dict['train'].to_hf_dataset() \
-            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
-        eval_dataset = dataset_dict['validation'].to_hf_dataset() \
-            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        train_dataset = dataset_dict['train'].remap_columns({
+            'text1': 'src_txt',
+            'text2': 'tgt_txt'
+        })
+        eval_dataset = dataset_dict['validation'].remap_columns({
+            'text1':
+            'src_txt',
+            'text2':
+            'tgt_txt'
+        })
         num_warmup_steps = 200
         os.environ['LOCAL_RANK'] = '0'
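The change relies on the existing trainer tests above for coverage. A direct unit test for the wrapper could look like the sketch below; this is hypothetical and not part of this PR, and it assumes MsDataset can wrap an in-memory hf Dataset, which the removed MPlug code above already does:

    import unittest

    from datasets import Dataset
    from modelscope.msdatasets import MsDataset


    class TestRemapColumns(unittest.TestCase):

        def test_remap_columns_renames_and_unwraps(self):
            # Build a tiny in-memory hf dataset and wrap it in MsDataset.
            hf_ds = Dataset.from_dict({'text1': ['a', 'b'], 'text2': ['c', 'd']})
            ms_ds = MsDataset(hf_ds)
            # remap_columns should rename the columns and return the underlying
            # hf dataset, so column_names is available directly on the result.
            remapped = ms_ds.remap_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
            self.assertEqual(set(remapped.column_names), {'src_txt', 'tgt_txt'})


    if __name__ == '__main__':
        unittest.main()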