using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Linq;
using static Tensorflow.KerasApi;

namespace Tensorflow.Keras.UnitTest
{
    [TestClass]
    public class PreprocessingTests : EagerModeTestBase
    {
        private readonly string[] texts = new string[] {
            "It was the best of times, it was the worst of times.",
            "Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
            "It was the best of times, it was the worst of times.",
            "Mr and Mrs Dursley of number four, Privet Drive.",
        };

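        // The same corpus, pre-tokenized, for the overloads that accept token lists.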
        private readonly string[][] tokenized_texts = new string[][] {
            new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
            new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"},
            new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
            new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"},
        };

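        // What sequences_to_texts is expected to reconstruct: lowercased, punctuation stripped.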
        private readonly string[] processed_texts = new string[] {
            "it was the best of times it was the worst of times",
            "mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
            "it was the best of times it was the worst of times",
            "mr and mrs dursley of number four privet drive",
        };

        private const string OOV = "<OOV>";

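        // word_index maps each distinct word to a 1-based index; index 0 is kept
        // free so pad_sequences can use it for padding. When an oov_token is
        // configured it is assigned index 1 and every other word shifts up by one,
        // which is why the OOV tests expect 28 entries instead of 27 and indices
        // one higher than their no-OOV counterparts.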
        [TestMethod]
        public void TokenizeWithNoOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["number"]);
            Assert.AreEqual(16, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithNoOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["number"]);
            Assert.AreEqual(16, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(28, tokenizer.word_index.Count);

            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["number"]);
            Assert.AreEqual(17, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(28, tokenizer.word_index.Count);

            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["number"]);
            Assert.AreEqual(17, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeTextsToSequences()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequences_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn1()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn2()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn3()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOVPresent()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

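            // num_words: 20 caps the vocabulary during texts_to_sequences: tokens
            // whose index is 20 or higher are replaced by the OOV index.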
            var oov_count = 0;
            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    if (tokenizer.word_index[OOV] == sequences[i][j])
                        oov_count += 1;

            Assert.AreEqual(9, oov_count);
        }

        [TestMethod]
        public void PadSequencesWithDefaults()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
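            // pad_sequences defaults to padding: "pre" and truncating: "pre",
            // padding every row to the longest sequence (22 tokens here).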
            var padded = keras.preprocessing.sequence.pad_sequences(sequences);

            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(22, padded.dims[1]);

            Assert.AreEqual(padded[0, 19], tokenizer.word_index["worst"]);
            for (var i = 0; i < 8; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 10], tokenizer.word_index["proud"]);
            for (var i = 0; i < 20; i++)
                Assert.AreNotEqual(padded[1, i], 0);
        }

        [TestMethod]
        public void PadSequencesPrePaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
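            // maxlen: 15 pre-truncates the 22-token sentence (keeping its last 15
            // tokens) and pre-pads the shorter ones with zeros.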
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15);

            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(15, padded.dims[1]);

            Assert.AreEqual(padded[0, 12], tokenizer.word_index["worst"]);
            for (var i = 0; i < 3; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 3], tokenizer.word_index["proud"]);
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(padded[1, i], 0);
        }

        [TestMethod]
        public void PadSequencesPrePaddingTrunc_Larger()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45);

            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(45, padded.dims[1]);

            Assert.AreEqual(padded[0, 42], tokenizer.word_index["worst"]);
            for (var i = 0; i < 33; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 33], tokenizer.word_index["proud"]);
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
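            // padding/truncating "post": tokens keep their original positions and
            // the zeros (or the cut) land at the end of each row.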
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post");

            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(15, padded.dims[1]);

            Assert.AreEqual(padded[0, 9], tokenizer.word_index["worst"]);
            for (var i = 12; i < 15; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 10], tokenizer.word_index["proud"]);
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(padded[1, i], 0);
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc_Larger()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post");

            Assert.AreEqual(4, padded.dims[0]);
            Assert.AreEqual(45, padded.dims[1]);

            Assert.AreEqual(padded[0, 9], tokenizer.word_index["worst"]);
            for (var i = 32; i < 45; i++)
                Assert.AreEqual(padded[0, i], 0);
            Assert.AreEqual(padded[1, 10], tokenizer.word_index["proud"]);
        }

        [TestMethod]
        public void TextToMatrixBinary()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

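            // texts_to_matrix returns one row per text with a column per word_index
            // entry plus the unused column 0 (28 columns here); in the default
            // "binary" mode a cell is 1 if the word occurs in the text at all.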
            var matrix = tokenizer.texts_to_matrix(texts);

            Assert.AreEqual(texts.Length, matrix.dims[0]);

            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()));
            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>()));
        }

        [TestMethod]
        public void TextToMatrixCount()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

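            // mode: "count" stores raw occurrence counts, e.g. 2 for the words
            // that appear twice in "It was the best of times...".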
            var matrix = tokenizer.texts_to_matrix(texts, mode: "count");

            Assert.AreEqual(texts.Length, matrix.dims[0]);

            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()));
            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>()));
        }

        [TestMethod]
        public void TextToMatrixFrequency()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

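            // mode: "freq" divides each count by the text's token count, hence the
            // expected values of n/12 for the 12-token text and n/22 for the
            // 22-token one.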
            var matrix = tokenizer.texts_to_matrix(texts, mode: "freq");

            Assert.AreEqual(texts.Length, matrix.dims[0]);

            double t12 = 2.0 / 12.0;
            double o12 = 1.0 / 12.0;
            double t22 = 2.0 / 22.0;
            double o22 = 1.0 / 22.0;

            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()));
            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray<double>()));
        }

        [TestMethod]
        public void TextToMatrixTfIdf()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

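            // mode: "tfidf" applies tf-idf weighting: words concentrated in few
            // documents score highest, while "of", which appears in every document,
            // scores 0 under this implementation. The constants below are golden
            // values captured from the implementation's output.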
            var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");

            Assert.AreEqual(texts.Length, matrix.dims[0]);

            double t1 = 1.1736001944781467;
            double t2 = 0.69314718055994529;
            double t3 = 1.860112299086919;
            double t4 = 1.0986122886681098;
            double t5 = 0.69314718055994529;

            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()));
            Assert.IsTrue(Enumerable.SequenceEqual(new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>()));
        }
    }
}