You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

PreprocessingTests.cs 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. using Microsoft.VisualStudio.TestTools.UnitTesting;
  2. using System;
  3. using System.Linq;
  4. using System.Collections.Generic;
  5. using System.Text;
  6. using NumSharp;
  7. using static Tensorflow.KerasApi;
  8. using Tensorflow;
  9. using Tensorflow.Keras.Datasets;
  10. using Microsoft.Extensions.DependencyInjection;
  namespace TensorFlowNET.Keras.UnitTest
  {
      /// <summary>
      /// Unit tests for <c>keras.preprocessing.text.Tokenizer</c> and
      /// <c>keras.preprocessing.sequence.pad_sequences</c>.
      /// </summary>
      [TestClass]
      public class PreprocessingTests : EagerModeTestBase
      {
          /// <summary>
          /// Absolute tolerance used when comparing floating-point matrix entries.
          /// The "freq" and "tfidf" expectations are computed or hard-coded doubles,
          /// so a bit-exact comparison would be needlessly brittle.
          /// </summary>
          private const double Tolerance = 1e-9;

          /// <summary>Raw input sentences (texts 0 and 2 are identical).</summary>
          private readonly string[] texts = new string[] {
              "It was the best of times, it was the worst of times.",
              "Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
              "It was the best of times, it was the worst of times.",
              "Mr and Mrs Dursley of number four, Privet Drive.",
          };

          /// <summary>
          /// The same sentences as <see cref="texts"/>, already split into word tokens.
          /// Note that "It" keeps its capital letter in rows 0 and 2.
          /// </summary>
          private readonly string[][] tokenized_texts = new string[][] {
              new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
              new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"},
              new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
              new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"},
          };

          /// <summary>
          /// Text expected back from a texts -> sequences -> texts round trip:
          /// lower-cased, punctuation removed, words separated by single spaces.
          /// </summary>
          private readonly string[] processed_texts = new string[] {
              "it was the best of times it was the worst of times",
              "mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
              "it was the best of times it was the worst of times",
              "mr and mrs dursley of number four privet drive",
          };

          /// <summary>Token passed as <c>oov_token</c> in the out-of-vocabulary tests.</summary>
          private const string OOV = "<OOV>";

          /// <summary>Fitting on raw texts without an OOV token yields 27 indexed words.</summary>
          [TestMethod]
          public void TokenizeWithNoOOV()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              Assert.AreEqual(27, tokenizer.word_index.Count);

              Assert.AreEqual(7, tokenizer.word_index["worst"]);
              Assert.AreEqual(12, tokenizer.word_index["number"]);
              Assert.AreEqual(16, tokenizer.word_index["were"]);
          }

          /// <summary>Same expectations as <see cref="TokenizeWithNoOOV"/>, fitting on pre-tokenized input.</summary>
          [TestMethod]
          public void TokenizeWithNoOOV_Tkn()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              // Use the list version, where the tokenization has already been done.
              tokenizer.fit_on_texts(tokenized_texts);

              Assert.AreEqual(27, tokenizer.word_index.Count);

              Assert.AreEqual(7, tokenizer.word_index["worst"]);
              Assert.AreEqual(12, tokenizer.word_index["number"]);
              Assert.AreEqual(16, tokenizer.word_index["were"]);
          }

          /// <summary>
          /// With an OOV token the index holds one more entry (28), the OOV token is
          /// expected at index 1, and the word indices asserted here are one higher
          /// than in <see cref="TokenizeWithNoOOV"/>.
          /// </summary>
          [TestMethod]
          public void TokenizeWithOOV()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              Assert.AreEqual(28, tokenizer.word_index.Count);

              Assert.AreEqual(1, tokenizer.word_index[OOV]);
              Assert.AreEqual(8, tokenizer.word_index["worst"]);
              Assert.AreEqual(13, tokenizer.word_index["number"]);
              Assert.AreEqual(17, tokenizer.word_index["were"]);
          }

          /// <summary>Same expectations as <see cref="TokenizeWithOOV"/>, fitting on pre-tokenized input.</summary>
          [TestMethod]
          public void TokenizeWithOOV_Tkn()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              // Use the list version, where the tokenization has already been done.
              tokenizer.fit_on_texts(tokenized_texts);

              Assert.AreEqual(28, tokenizer.word_index.Count);

              Assert.AreEqual(1, tokenizer.word_index[OOV]);
              Assert.AreEqual(8, tokenizer.word_index["worst"]);
              Assert.AreEqual(13, tokenizer.word_index["number"]);
              Assert.AreEqual(17, tokenizer.word_index["were"]);
          }

          /// <summary>
          /// Each text becomes one sequence, and a word's position in the sequence
          /// matches its position in the sentence ("worst" is word 9 of text 0,
          /// "proud" is word 10 of text 1).
          /// </summary>
          [TestMethod]
          public void TokenizeTextsToSequences()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              Assert.AreEqual(4, sequences.Count);

              Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
              Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
          }

          /// <summary>Same as <see cref="TokenizeTextsToSequences"/>, using pre-tokenized input throughout.</summary>
          [TestMethod]
          public void TokenizeTextsToSequences_Tkn()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              // Use the list version, where the tokenization has already been done.
              tokenizer.fit_on_texts(tokenized_texts);

              var sequences = tokenizer.texts_to_sequences(tokenized_texts);
              Assert.AreEqual(4, sequences.Count);

              Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
              Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
          }

          /// <summary>Round trip: raw texts -> sequences -> texts must equal <see cref="processed_texts"/>.</summary>
          [TestMethod]
          public void TokenizeTextsToSequencesAndBack()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              Assert.AreEqual(4, sequences.Count);

              var processed = tokenizer.sequences_to_texts(sequences);

              Assert.AreEqual(4, processed.Count);

              for (var i = 0; i < processed.Count; i++)
              {
                  Assert.AreEqual(processed_texts[i], processed[i]);
              }
          }

          /// <summary>Round trip where both fitting and sequencing use pre-tokenized input.</summary>
          [TestMethod]
          public void TokenizeTextsToSequencesAndBack_Tkn1()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              // Use the list version, where the tokenization has already been done.
              tokenizer.fit_on_texts(tokenized_texts);

              // Use the list version, where the tokenization has already been done.
              var sequences = tokenizer.texts_to_sequences(tokenized_texts);
              Assert.AreEqual(4, sequences.Count);

              var processed = tokenizer.sequences_to_texts(sequences);

              Assert.AreEqual(4, processed.Count);

              for (var i = 0; i < processed.Count; i++)
              {
                  Assert.AreEqual(processed_texts[i], processed[i]);
              }
          }

          /// <summary>Round trip where fitting uses pre-tokenized input and sequencing uses raw texts.</summary>
          [TestMethod]
          public void TokenizeTextsToSequencesAndBack_Tkn2()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              // Use the list version, where the tokenization has already been done.
              tokenizer.fit_on_texts(tokenized_texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              Assert.AreEqual(4, sequences.Count);

              var processed = tokenizer.sequences_to_texts(sequences);

              Assert.AreEqual(4, processed.Count);

              for (var i = 0; i < processed.Count; i++)
              {
                  Assert.AreEqual(processed_texts[i], processed[i]);
              }
          }

          /// <summary>Round trip where fitting uses raw texts and sequencing uses pre-tokenized input.</summary>
          [TestMethod]
          public void TokenizeTextsToSequencesAndBack_Tkn3()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              // Use the list version, where the tokenization has already been done.
              var sequences = tokenizer.texts_to_sequences(tokenized_texts);
              Assert.AreEqual(4, sequences.Count);

              var processed = tokenizer.sequences_to_texts(sequences);

              Assert.AreEqual(4, processed.Count);

              for (var i = 0; i < processed.Count; i++)
              {
                  Assert.AreEqual(processed_texts[i], processed[i]);
              }
          }

          /// <summary>
          /// With an OOV token configured but no vocabulary limit, every word of the
          /// fitted texts is known, so no sequence entry may equal the OOV index.
          /// </summary>
          [TestMethod]
          public void TokenizeTextsToSequencesWithOOV()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              Assert.AreEqual(4, sequences.Count);

              Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
              Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

              for (var i = 0; i < sequences.Count; i++)
              {
                  for (var j = 0; j < sequences[i].Length; j++)
                  {
                      Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
                  }
              }
          }

          /// <summary>
          /// With <c>num_words: 20</c> some words are expected to be replaced by the
          /// OOV index; the test pins the total number of such replacements to 9.
          /// </summary>
          [TestMethod]
          public void TokenizeTextsToSequencesWithOOVPresent()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              Assert.AreEqual(4, sequences.Count);

              Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
              Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

              // Count every position, across all sequences, that holds the OOV index.
              var oov_count = 0;
              for (var i = 0; i < sequences.Count; i++)
              {
                  for (var j = 0; j < sequences[i].Length; j++)
                  {
                      if (tokenizer.word_index[OOV] == sequences[i][j])
                      {
                          oov_count += 1;
                      }
                  }
              }

              Assert.AreEqual(9, oov_count);
          }

          /// <summary>
          /// Default padding: the result is expected to be 4 x 22 (22 is the length of
          /// the longest sequence) with zeros at the front of shorter rows.
          /// </summary>
          [TestMethod]
          public void PadSequencesWithDefaults()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              var padded = keras.preprocessing.sequence.pad_sequences(sequences);

              Assert.AreEqual(4, padded.shape[0]);
              Assert.AreEqual(22, padded.shape[1]);

              // Row 0 has 12 tokens, so "worst" (token 9) is expected at column 19.
              Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32());
              for (var i = 0; i < 8; i++)
              {
                  Assert.AreEqual(0, padded[0, i].GetInt32());
              }

              // Row 1 is the longest text, so "proud" stays at column 10.
              Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
              for (var i = 0; i < 20; i++)
              {
                  Assert.AreNotEqual(0, padded[1, i].GetInt32());
              }
          }

          /// <summary>
          /// <c>maxlen: 15</c> with default padding/truncation: short rows are expected
          /// to get leading zeros and long rows to lose their leading tokens.
          /// </summary>
          [TestMethod]
          public void PadSequencesPrePaddingTrunc()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15);

              Assert.AreEqual(4, padded.shape[0]);
              Assert.AreEqual(15, padded.shape[1]);

              // Row 0: 12 tokens after 3 zeros, so "worst" is expected at column 12.
              Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32());
              for (var i = 0; i < 3; i++)
              {
                  Assert.AreEqual(0, padded[0, i].GetInt32());
              }

              // Row 1: 22 tokens cut to the last 15, so "proud" moves from 10 to 3.
              Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 3].GetInt32());
              for (var i = 0; i < 15; i++)
              {
                  Assert.AreNotEqual(0, padded[1, i].GetInt32());
              }
          }

          /// <summary>
          /// <c>maxlen: 45</c> exceeds every sequence length, so all rows are expected
          /// to be padded at the front and none truncated.
          /// </summary>
          [TestMethod]
          public void PadSequencesPrePaddingTrunc_Larger()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45);

              Assert.AreEqual(4, padded.shape[0]);
              Assert.AreEqual(45, padded.shape[1]);

              // Row 0: 33 zeros followed by 12 tokens.
              Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 42].GetInt32());
              for (var i = 0; i < 33; i++)
              {
                  Assert.AreEqual(0, padded[0, i].GetInt32());
              }

              // Row 1: 23 zeros followed by 22 tokens.
              Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 33].GetInt32());
          }

          /// <summary>
          /// <c>maxlen: 15</c> with post padding and post truncation: tokens keep their
          /// original columns, and zeros / cuts are expected at the end of the row.
          /// </summary>
          [TestMethod]
          public void PadSequencesPostPaddingTrunc()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post");

              Assert.AreEqual(4, padded.shape[0]);
              Assert.AreEqual(15, padded.shape[1]);

              Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
              for (var i = 12; i < 15; i++)
              {
                  Assert.AreEqual(0, padded[0, i].GetInt32());
              }

              Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
              for (var i = 0; i < 15; i++)
              {
                  Assert.AreNotEqual(0, padded[1, i].GetInt32());
              }
          }

          /// <summary>
          /// <c>maxlen: 45</c> with post padding: tokens keep their original columns
          /// and the tail of each row is expected to be zero.
          /// </summary>
          [TestMethod]
          public void PadSequencesPostPaddingTrunc_Larger()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
              tokenizer.fit_on_texts(texts);

              var sequences = tokenizer.texts_to_sequences(texts);
              var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post");

              Assert.AreEqual(4, padded.shape[0]);
              Assert.AreEqual(45, padded.shape[1]);

              Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
              for (var i = 32; i < 45; i++)
              {
                  Assert.AreEqual(0, padded[0, i].GetInt32());
              }

              Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
          }

          /// <summary>
          /// Default <c>texts_to_matrix</c> mode: each expected row is a 28-entry 0/1
          /// vector marking which word indices occur in the text.
          /// </summary>
          [TestMethod]
          public void TextToMatrixBinary()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              Assert.AreEqual(27, tokenizer.word_index.Count);

              var matrix = tokenizer.texts_to_matrix(texts);

              Assert.AreEqual(texts.Length, matrix.shape[0]);

              CompareLists(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
              CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
          }

          /// <summary>"count" mode: each expected entry is the number of occurrences of that word index in the text.</summary>
          [TestMethod]
          public void TextToMatrixCount()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              Assert.AreEqual(27, tokenizer.word_index.Count);

              var matrix = tokenizer.texts_to_matrix(texts, mode: "count");

              Assert.AreEqual(texts.Length, matrix.shape[0]);

              CompareLists(new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
              CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
          }

          /// <summary>
          /// "freq" mode: expected entries are occurrence counts divided by the
          /// number of tokens in the text (12 for text 0, 22 for text 1).
          /// </summary>
          [TestMethod]
          public void TextToMatrixFrequency()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              Assert.AreEqual(27, tokenizer.word_index.Count);

              var matrix = tokenizer.texts_to_matrix(texts, mode: "freq");

              Assert.AreEqual(texts.Length, matrix.shape[0]);

              // tNN / oNN: a word occurring twice / once in a text of NN tokens.
              double t12 = 2.0 / 12.0;
              double o12 = 1.0 / 12.0;
              double t22 = 2.0 / 22.0;
              double o22 = 1.0 / 22.0;

              CompareLists(new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
              CompareLists(new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray<double>());
          }

          /// <summary>
          /// "tfidf" mode, checked against hard-coded reference values.
          /// NOTE(review): the method name reads "TDIDF"; it is kept unchanged so the
          /// test identifier stays stable, but "TFIDF" was presumably intended.
          /// </summary>
          [TestMethod]
          public void TextToMatrixTDIDF()
          {
              var tokenizer = keras.preprocessing.text.Tokenizer();
              tokenizer.fit_on_texts(texts);

              Assert.AreEqual(27, tokenizer.word_index.Count);

              var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");

              Assert.AreEqual(texts.Length, matrix.shape[0]);

              // Reference weights; they are compared within Tolerance by CompareLists.
              double t1 = 1.1736001944781467;
              double t2 = 0.69314718055994529;
              double t3 = 1.860112299086919;
              double t4 = 1.0986122886681098;
              double t5 = 0.69314718055994529;

              CompareLists(new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
              CompareLists(new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>());
          }

          /// <summary>
          /// Asserts that two lists have the same length and pairwise-equal elements.
          /// </summary>
          /// <remarks>
          /// Elements that are <see cref="double"/> are compared within
          /// <see cref="Tolerance"/> instead of bit-exactly, because several callers
          /// pass computed floating-point values. All other element types use the
          /// regular <c>Assert.AreEqual</c> equality. Failure messages carry the
          /// offending index.
          /// </remarks>
          /// <typeparam name="T">Element type of both lists.</typeparam>
          /// <param name="expected">The expected values.</param>
          /// <param name="actual">The values produced by the code under test.</param>
          private void CompareLists<T>(IList<T> expected, IList<T> actual)
          {
              Assert.AreEqual(expected.Count, actual.Count, "List lengths differ.");
              for (var i = 0; i < expected.Count; i++)
              {
                  // Box once so the runtime type can be inspected regardless of T.
                  object expectedItem = expected[i];
                  object actualItem = actual[i];
                  var message = $"Mismatch at index {i}.";

                  if (expectedItem is double && actualItem is double)
                  {
                      Assert.AreEqual((double)expectedItem, (double)actualItem, Tolerance, message);
                  }
                  else
                  {
                      Assert.AreEqual(expected[i], actual[i], message);
                  }
              }
          }
      }
  }