import unittest

import datasets as hfdata

from modelscope.pydatasets import PyDataset


class PyDatasetTest(unittest.TestCase):

    def setUp(self):
        # ds1 is initialized from in-memory json data
        self.json_data = {
            'dummy': [{
                'a': i,
                'x': i * 10,
                'c': i * 100
            } for i in range(1, 11)]
        }
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.ds1 = PyDataset.from_hf_dataset(hfds1)

        # ds2 is initialized from the Hugging Face Hub
        hfds2 = hfdata.load_dataset(
            'glue', 'mrpc', revision='2.0.0', split='train')
        self.ds2 = PyDataset.from_hf_dataset(hfds2)

    def tearDown(self):
        pass

    def test_to_hf_dataset(self):
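        # converting back to a hf Dataset should preserve the underlying data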
        hfds = self.ds1.to_hf_dataset()
        hfds1 = hfdata.Dataset.from_dict(self.json_data)
        self.assertEqual(hfds.data, hfds1.data)

        # a simple map call adds a new column derived from each record
        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
        self.assertEqual(len(hfds['new_feature']), 10)

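        # spot check a known MRPC training example after conversion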
        hfds2 = self.ds2.to_hf_dataset()
        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))


if __name__ == '__main__':
    unittest.main()