
test_text_tokenizer.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
  15. """
  16. Testing UnicodeCharTokenizer op in DE
  17. """
  18. import numpy as np
  19. import mindspore.dataset as ds
  20. from mindspore import log as logger
  21. import mindspore.dataset.text as text
  22. DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
  23. NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
  24. REGEX_REPLACE_FILE = "../data/dataset/testTokenizerData/regex_replace.txt"
  25. REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"


def split_by_unicode_char(input_strs):
    """
    Split UTF-8 strings into lists of unicode characters
    """
    out = []
    for s in input_strs:
        out.append([c for c in s])
    return out
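
# A quick illustration of the helper above (plain Python, no dataset needed):
# split_by_unicode_char(["ab", "北京"]) returns [['a', 'b'], ['北', '京']],
# i.e. each input string becomes the list of its unicode characters.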


def test_unicode_char_tokenizer_default():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens
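
# Note on the expected offsets above: with_offsets=True reports byte offsets
# into the UTF-8 encoded line, not character indices, which is why the offsets
# for "北京欢迎您" advance in steps of 3 (each Chinese character is 3 bytes).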


def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens


def test_unicode_script_tokenizer_default():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs == tokens
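
# UnicodeScriptTokenizer splits text on Unicode script boundaries, so Han
# text, Latin text and punctuation land in separate tokens (e.g. "我喜欢English!"
# becomes ["我喜欢", "English", "!"]); with keep_whitespace=False the whitespace
# between tokens is dropped rather than emitted as tokens.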


def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_unicode_script_tokenizer_with_offsets():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=False and with_offsets=True
    """
    unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
                           ["北京欢迎您", "!"],
                           ["我喜欢", "English", "!"],
                           [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs == tokens


def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = text.CaseFold()
    dataset = dataset.map(operations=op)
    lower_strs = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        lower_strs.append(token)
    assert lower_strs == expect_strs
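
# CaseFold applies Unicode case folding, which for this data amounts to
# lowercasing the Latin letters while leaving the Chinese characters unchanged,
# as the expected strings above show.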


def test_normalize_utf8():
    """
    Test NormalizeUTF8
    """
    def normalize(normalize_form):
        dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
        normalize = text.NormalizeUTF8(normalize_form=normalize_form)
        dataset = dataset.map(operations=normalize)
        out_bytes = []
        out_texts = []
        for i in dataset.create_dict_iterator():
            out_bytes.append(i['text'])
            out_texts.append(text.to_str(i['text']).tolist())
        logger.info("The out bytes is : {}".format(out_bytes))
        logger.info("The out texts is: {}".format(out_texts))
        return out_bytes

    expect_normalize_data = [
        # NFC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xe1\xba\x9b\xcc\xa3'],
        # NFKC
        [b'\xe1\xb9\xa9', b'\xe1\xb8\x8d\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b'\xe1\xb9\xa9'],
        # NFD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'\xef\xac\x81', b'2\xe2\x81\xb5', b'\xc5\xbf\xcc\xa3\xcc\x87'],
        # NFKD
        [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
         b'fi', b'25', b's\xcc\xa3\xcc\x87']
    ]
    assert normalize(text.utils.NormalizeForm.NFC) == expect_normalize_data[0]
    assert normalize(text.utils.NormalizeForm.NFKC) == expect_normalize_data[1]
    assert normalize(text.utils.NormalizeForm.NFD) == expect_normalize_data[2]
    assert normalize(text.utils.NormalizeForm.NFKD) == expect_normalize_data[3]
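
# For reference, the expected bytes above encode the usual normalization
# differences: b'\xef\xac\x81' is the "fi" ligature (U+FB01) and b'2\xe2\x81\xb5'
# is "2" followed by superscript five (U+2075); the compatibility forms
# NFKC/NFKD fold them to plain "fi" and "25", while NFC/NFD leave them intact.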


def test_regex_replace():
    """
    Test RegexReplace
    """
    def regex_replace(first, last, expect_str, pattern, replace):
        dataset = ds.TextFileDataset(REGEX_REPLACE_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        replace_op = text.RegexReplace(pattern, replace)
        dataset = dataset.map(operations=replace_op)
        out_text = []
        for i in dataset.create_dict_iterator():
            token = text.to_str(i['text']).tolist()
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))
        assert expect_str == out_text

    regex_replace(1, 2, ['H____ W____', "L__'_ G_"], "\\p{Ll}", '_')
    regex_replace(3, 5, ['hello', 'world', '31:beijing'], "^(\\d:|b:)", "")
    regex_replace(6, 6, ["WelcometoChina!"], "\\s+", "")
    regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
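
# The patterns above are Unicode-aware (ICU-style) regular expressions:
# \p{Ll} matches lowercase letters and \p{Cc}/\p{Cf} match control/format
# characters, so replacing those matches (plus \s+ whitespace) with "" or "_"
# yields the expected strings.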


def test_regex_tokenizer_default():
    """
    Test RegexTokenizer
    """
    def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
        dataset = dataset.map(operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator():
            token = text.to_str(i['text']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")
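
# In the calls above, delim_pattern decides where the line is split, and any
# delimiter that also matches keep_delim_pattern is kept as a token of its own;
# compare the first two calls, which differ only in whether the whitespace
# delimiters are kept.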


def test_regex_tokenizer_with_offsets():
    """
    Test RegexTokenizer with with_offsets=True
    """
    def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
                        keep_delim_pattern):
        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
        if first > 1:
            dataset = dataset.skip(first - 1)
        if last >= first:
            dataset = dataset.take(last - first + 1)
        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
        dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
                              columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
        out_text = []
        count = 0
        for i in dataset.create_dict_iterator():
            token = text.to_str(i['token']).tolist()
            np.testing.assert_array_equal(token, expect_str[count])
            np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
            np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
            count += 1
            out_text.append(token)
        logger.info("Out: {}".format(out_text))
        logger.info("Exp: {}".format(expect_str))

    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
                    "\\s+", "\\s+")
    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
                    [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
                    r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
    regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
    regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")


if __name__ == '__main__':
    test_unicode_char_tokenizer_default()
    test_unicode_char_tokenizer_with_offsets()
    test_whitespace_tokenizer_default()
    test_whitespace_tokenizer_with_offsets()
    test_unicode_script_tokenizer_default()
    test_unicode_script_tokenizer_default2()
    test_unicode_script_tokenizer_with_offsets()
    test_unicode_script_tokenizer_with_offsets2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
    test_regex_tokenizer_default()
    test_regex_tokenizer_with_offsets()