You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

test_datasets_csv.py 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import mindspore.dataset as ds
  16. import numpy as np
  17. import pytest
  18. DATA_FILE = '../data/dataset/testCSV/1.csv'
  19. def test_csv_dataset_basic():
  20. """
  21. Test CSV with repeat, skip and so on
  22. """
  23. TRAIN_FILE = '../data/dataset/testCSV/1.csv'
  24. buffer = []
  25. data = ds.CSVDataset(
  26. TRAIN_FILE,
  27. column_defaults=["0", 0, 0.0, "0"],
  28. column_names=['1', '2', '3', '4'],
  29. shuffle=False)
  30. data = data.repeat(2)
  31. data = data.skip(2)
  32. for d in data.create_dict_iterator():
  33. buffer.append(d)
  34. assert len(buffer) == 4
  35. def test_csv_dataset_one_file():
  36. data = ds.CSVDataset(
  37. DATA_FILE,
  38. column_defaults=["1", "2", "3", "4"],
  39. column_names=['col1', 'col2', 'col3', 'col4'],
  40. shuffle=False)
  41. buffer = []
  42. for d in data.create_dict_iterator():
  43. buffer.append(d)
  44. assert len(buffer) == 3
  45. def test_csv_dataset_all_file():
  46. APPEND_FILE = '../data/dataset/testCSV/2.csv'
  47. data = ds.CSVDataset(
  48. [DATA_FILE, APPEND_FILE],
  49. column_defaults=["1", "2", "3", "4"],
  50. column_names=['col1', 'col2', 'col3', 'col4'],
  51. shuffle=False)
  52. buffer = []
  53. for d in data.create_dict_iterator():
  54. buffer.append(d)
  55. assert len(buffer) == 10
  56. def test_csv_dataset_num_samples():
  57. data = ds.CSVDataset(
  58. DATA_FILE,
  59. column_defaults=["1", "2", "3", "4"],
  60. column_names=['col1', 'col2', 'col3', 'col4'],
  61. shuffle=False, num_samples=2)
  62. count = 0
  63. for _ in data.create_dict_iterator():
  64. count += 1
  65. assert count == 2
  66. def test_csv_dataset_distribution():
  67. TEST_FILE = '../data/dataset/testCSV/1.csv'
  68. data = ds.CSVDataset(
  69. TEST_FILE,
  70. column_defaults=["1", "2", "3", "4"],
  71. column_names=['col1', 'col2', 'col3', 'col4'],
  72. shuffle=False, num_shards=2, shard_id=0)
  73. count = 0
  74. for _ in data.create_dict_iterator():
  75. count += 1
  76. assert count == 2
  77. def test_csv_dataset_quoted():
  78. TEST_FILE = '../data/dataset/testCSV/quoted.csv'
  79. data = ds.CSVDataset(
  80. TEST_FILE,
  81. column_defaults=["", "", "", ""],
  82. column_names=['col1', 'col2', 'col3', 'col4'],
  83. shuffle=False)
  84. buffer = []
  85. for d in data.create_dict_iterator():
  86. buffer.extend([d['col1'].item().decode("utf8"),
  87. d['col2'].item().decode("utf8"),
  88. d['col3'].item().decode("utf8"),
  89. d['col4'].item().decode("utf8")])
  90. assert buffer == ['a', 'b', 'c', 'd']
  91. def test_csv_dataset_separated():
  92. TEST_FILE = '../data/dataset/testCSV/separated.csv'
  93. data = ds.CSVDataset(
  94. TEST_FILE,
  95. field_delim='|',
  96. column_defaults=["", "", "", ""],
  97. column_names=['col1', 'col2', 'col3', 'col4'],
  98. shuffle=False)
  99. buffer = []
  100. for d in data.create_dict_iterator():
  101. buffer.extend([d['col1'].item().decode("utf8"),
  102. d['col2'].item().decode("utf8"),
  103. d['col3'].item().decode("utf8"),
  104. d['col4'].item().decode("utf8")])
  105. assert buffer == ['a', 'b', 'c', 'd']
  106. def test_csv_dataset_embedded():
  107. TEST_FILE = '../data/dataset/testCSV/embedded.csv'
  108. data = ds.CSVDataset(
  109. TEST_FILE,
  110. column_defaults=["", "", "", ""],
  111. column_names=['col1', 'col2', 'col3', 'col4'],
  112. shuffle=False)
  113. buffer = []
  114. for d in data.create_dict_iterator():
  115. buffer.extend([d['col1'].item().decode("utf8"),
  116. d['col2'].item().decode("utf8"),
  117. d['col3'].item().decode("utf8"),
  118. d['col4'].item().decode("utf8")])
  119. assert buffer == ['a,b', 'c"d', 'e\nf', ' g ']
  120. def test_csv_dataset_chinese():
  121. TEST_FILE = '../data/dataset/testCSV/chinese.csv'
  122. data = ds.CSVDataset(
  123. TEST_FILE,
  124. column_defaults=["", "", "", "", ""],
  125. column_names=['col1', 'col2', 'col3', 'col4', 'col5'],
  126. shuffle=False)
  127. buffer = []
  128. for d in data.create_dict_iterator():
  129. buffer.extend([d['col1'].item().decode("utf8"),
  130. d['col2'].item().decode("utf8"),
  131. d['col3'].item().decode("utf8"),
  132. d['col4'].item().decode("utf8"),
  133. d['col5'].item().decode("utf8")])
  134. assert buffer == ['大家', '早上好', '中午好', '下午好', '晚上好']
  135. def test_csv_dataset_header():
  136. TEST_FILE = '../data/dataset/testCSV/header.csv'
  137. data = ds.CSVDataset(
  138. TEST_FILE,
  139. column_defaults=["", "", "", ""],
  140. shuffle=False)
  141. buffer = []
  142. for d in data.create_dict_iterator():
  143. buffer.extend([d['col1'].item().decode("utf8"),
  144. d['col2'].item().decode("utf8"),
  145. d['col3'].item().decode("utf8"),
  146. d['col4'].item().decode("utf8")])
  147. assert buffer == ['a', 'b', 'c', 'd']
  148. def test_csv_dataset_number():
  149. TEST_FILE = '../data/dataset/testCSV/number.csv'
  150. data = ds.CSVDataset(
  151. TEST_FILE,
  152. column_defaults=[0.0, 0.0, 0, 0.0],
  153. column_names=['col1', 'col2', 'col3', 'col4'],
  154. shuffle=False)
  155. buffer = []
  156. for d in data.create_dict_iterator():
  157. buffer.extend([d['col1'].item(),
  158. d['col2'].item(),
  159. d['col3'].item(),
  160. d['col4'].item()])
  161. assert np.allclose(buffer, [3.0, 0.3, 4, 55.5])
  162. def test_csv_dataset_size():
  163. TEST_FILE = '../data/dataset/testCSV/size.csv'
  164. data = ds.CSVDataset(
  165. TEST_FILE,
  166. column_defaults=[0.0, 0.0, 0, 0.0],
  167. column_names=['col1', 'col2', 'col3', 'col4'],
  168. shuffle=False)
  169. assert data.get_dataset_size() == 5
  170. def test_csv_dataset_exception():
  171. TEST_FILE = '../data/dataset/testCSV/exception.csv'
  172. data = ds.CSVDataset(
  173. TEST_FILE,
  174. column_defaults=["", "", "", ""],
  175. column_names=['col1', 'col2', 'col3', 'col4'],
  176. shuffle=False)
  177. with pytest.raises(Exception) as err:
  178. for _ in data.create_dict_iterator():
  179. pass
  180. assert "Failed to parse CSV file" in str(err.value)
  181. def test_csv_dataset_type_error():
  182. TEST_FILE = '../data/dataset/testCSV/exception.csv'
  183. data = ds.CSVDataset(
  184. TEST_FILE,
  185. column_defaults=["", 0, "", ""],
  186. column_names=['col1', 'col2', 'col3', 'col4'],
  187. shuffle=False)
  188. with pytest.raises(Exception) as err:
  189. for _ in data.create_dict_iterator():
  190. pass
  191. assert "invalid argument of stoi" in str(err.value)
  192. if __name__ == "__main__":
  193. test_csv_dataset_basic()
  194. test_csv_dataset_one_file()
  195. test_csv_dataset_all_file()
  196. test_csv_dataset_num_samples()
  197. test_csv_dataset_distribution()
  198. test_csv_dataset_quoted()
  199. test_csv_dataset_separated()
  200. test_csv_dataset_embedded()
  201. test_csv_dataset_chinese()
  202. test_csv_dataset_header()
  203. test_csv_dataset_number()
  204. test_csv_dataset_size()
  205. test_csv_dataset_exception()
  206. test_csv_dataset_type_error()