You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

test_py_dataset.py 1.2 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. import unittest
  2. import datasets as hfdata
  3. from modelscope.pydatasets import PyDataset
  4. class PyDatasetTest(unittest.TestCase):
  5. def setUp(self):
  6. # ds1 initialized from in memory json
  7. self.json_data = {
  8. 'dummy': [{
  9. 'a': i,
  10. 'x': i * 10,
  11. 'c': i * 100
  12. } for i in range(1, 11)]
  13. }
  14. hfds1 = hfdata.Dataset.from_dict(self.json_data)
  15. self.ds1 = PyDataset.from_hf_dataset(hfds1)
  16. # ds2 initialized from hg hub
  17. hfds2 = hfdata.load_dataset(
  18. 'glue', 'mrpc', revision='2.0.0', split='train')
  19. self.ds2 = PyDataset.from_hf_dataset(hfds2)
  20. def tearDown(self):
  21. pass
  22. def test_to_hf_dataset(self):
  23. hfds = self.ds1.to_hf_dataset()
  24. hfds1 = hfdata.Dataset.from_dict(self.json_data)
  25. self.assertEqual(hfds.data, hfds1.data)
  26. # simple map function
  27. hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
  28. self.assertEqual(len(hfds['new_feature']), 10)
  29. hfds2 = self.ds2.to_hf_dataset()
  30. self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))
  31. if __name__ == '__main__':
  32. unittest.main()