using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Linq;
using static Tensorflow.KerasApi;

namespace Tensorflow.Keras.UnitTest
{
    [TestClass]
    public class PreprocessingTests : EagerModeTestBase
    {
        private readonly string[] texts = new string[]
        {
            "It was the best of times, it was the worst of times.",
            "Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
            "It was the best of times, it was the worst of times.",
            "Mr and Mrs Dursley of number four, Privet Drive.",
        };

        private readonly string[][] tokenized_texts = new string[][]
        {
            new string[] { "It", "was", "the", "best", "of", "times", "it", "was", "the", "worst", "of", "times" },
            new string[] { "mr", "and", "mrs", "dursley", "of", "number", "four", "privet", "drive", "were", "proud", "to", "say", "that", "they", "were", "perfectly", "normal", "thank", "you", "very", "much" },
            new string[] { "It", "was", "the", "best", "of", "times", "it", "was", "the", "worst", "of", "times" },
            new string[] { "mr", "and", "mrs", "dursley", "of", "number", "four", "privet", "drive" },
        };

        private readonly string[] processed_texts = new string[]
        {
            "it was the best of times it was the worst of times",
            "mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
            "it was the best of times it was the worst of times",
            "mr and mrs dursley of number four privet drive",
        };

        private const string OOV = "<OOV>";

        [TestMethod]
        public void TokenizeWithNoOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["number"]);
            Assert.AreEqual(16, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithNoOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["number"]);
            Assert.AreEqual(16, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(28, tokenizer.word_index.Count);

            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["number"]);
            Assert.AreEqual(17, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(28, tokenizer.word_index.Count);

            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["number"]);
            Assert.AreEqual(17, tokenizer.word_index["were"]);
        }
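        // Note: word indices start at 1; index 0 is reserved for padding. When an
        // oov_token is supplied it is inserted at index 1, which is why every other
        // index in the OOV tests above shifts up by one ("worst": 7 -> 8, "number":
        // 12 -> 13, "were": 16 -> 17) and word_index.Count grows from 27 to 28.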
        [TestMethod]
        public void TokenizeTextsToSequences()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequences_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn1()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn2()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn3()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

            // Every word was seen during fitting, so the OOV index never appears.
            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOVPresent()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

            // With num_words: 20, words whose index is 20 or higher map to the OOV index.
            var oov_count = 0;
            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    if (tokenizer.word_index[OOV] == sequences[i][j])
                        oov_count += 1;

            Assert.AreEqual(9, oov_count);
        }
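        // The pad_sequences tests below rely on the Keras defaults: with no maxlen
        // the output is padded to the longest sequence (22 tokens here), and both
        // padding and truncating default to "pre", so short sequences gain zeros on
        // the left and over-long ones lose tokens from the front.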
        [TestMethod]
        public void PadSequencesWithDefaults()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);
            var sequences = tokenizer.texts_to_sequences(texts);

            var padded = keras.preprocessing.sequence.pad_sequences(sequences);
            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(22, padded.dims[1]);

            Assert.AreEqual(padded[0, 19], tokenizer.word_index["worst"]);
            for (var i = 0; i < 8; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 10], tokenizer.word_index["proud"]);
            for (var i = 0; i < 20; i++)
                Assert.AreNotEqual(padded[1, i], 0);
        }

        [TestMethod]
        public void PadSequencesPrePaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);
            var sequences = tokenizer.texts_to_sequences(texts);

            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15);
            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(15, padded.dims[1]);

            Assert.AreEqual(padded[0, 12], tokenizer.word_index["worst"]);
            for (var i = 0; i < 3; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 3], tokenizer.word_index["proud"]);
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(padded[1, i], 0);
        }

        [TestMethod]
        public void PadSequencesPrePaddingTrunc_Larger()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);
            var sequences = tokenizer.texts_to_sequences(texts);

            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45);
            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(45, padded.dims[1]);

            Assert.AreEqual(padded[0, 42], tokenizer.word_index["worst"]);
            for (var i = 0; i < 33; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 33], tokenizer.word_index["proud"]);
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);
            var sequences = tokenizer.texts_to_sequences(texts);

            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post");
            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(15, padded.dims[1]);

            Assert.AreEqual(padded[0, 9], tokenizer.word_index["worst"]);
            for (var i = 12; i < 15; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 10], tokenizer.word_index["proud"]);
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(padded[1, i], 0);
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc_Larger()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);
            var sequences = tokenizer.texts_to_sequences(texts);

            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post");
            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(45, padded.dims[1]);

            Assert.AreEqual(padded[0, 9], tokenizer.word_index["worst"]);
            for (var i = 32; i < 45; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 10], tokenizer.word_index["proud"]);
        }
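        // texts_to_matrix returns one row per input text and word_index.Count + 1
        // columns (28 here), since column 0 is reserved. The expected rows below
        // assume indices assigned in order of first appearance: it=1, was=2, the=3,
        // best=4, of=5, times=6, worst=7, then mr..much continuing at 8..27. The
        // modes exercised are "binary" (the default: presence flags), "count",
        // "freq" (count divided by the document's token total) and "tfidf".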
        [TestMethod]
        public void TextToMatrixBinary()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts);
            Assert.AreEqual(texts.Length, matrix.dims[0]);

            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
                matrix[0].ToArray()));
            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
                matrix[1].ToArray()));
        }

        [TestMethod]
        public void TextToMatrixCount()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts, mode: "count");
            Assert.AreEqual(texts.Length, matrix.dims[0]);

            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
                matrix[0].ToArray()));
            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
                matrix[1].ToArray()));
        }

        [TestMethod]
        public void TextToMatrixFrequency()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts, mode: "freq");
            Assert.AreEqual(texts.Length, matrix.dims[0]);

            // Texts 0 and 1 have 12 and 22 tokens respectively, so each count is
            // normalized by that total.
            double t12 = 2.0 / 12.0;
            double o12 = 1.0 / 12.0;
            double t22 = 2.0 / 22.0;
            double o22 = 1.0 / 22.0;

            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
                matrix[0].ToArray()));
            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 },
                matrix[1].ToArray()));
        }
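        // The tfidf constants below decompose numerically as follows (a reading of
        // the expected values, not necessarily the exact formula TF.NET implements):
        // t2 = t5 = ln 2 and t4 = ln 3 are pure idf weights for words occurring once
        // in a document, while t1 = (1 + ln 2) * ln 2 and t3 = (1 + ln 2) * ln 3 add
        // the sub-linear 1 + ln(count) term-frequency factor for words occurring twice.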
        [TestMethod]
        public void TextToMatrixTFIDF()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");
            Assert.AreEqual(texts.Length, matrix.dims[0]);

            double t1 = 1.1736001944781467;
            double t2 = 0.69314718055994529;
            double t3 = 1.860112299086919;
            double t4 = 1.0986122886681098;
            double t5 = 0.69314718055994529;

            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
                matrix[0].ToArray()));
            Assert.IsTrue(Enumerable.SequenceEqual(
                new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 },
                matrix[1].ToArray()));
        }
    }
}