using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Linq;
using System.Collections.Generic;
using System.Text;
using NumSharp;
using static Tensorflow.KerasApi;
using Tensorflow;
using Tensorflow.Keras.Datasets;

namespace TensorFlowNET.Keras.UnitTest
{
    [TestClass]
    public class PreprocessingTests : EagerModeTestBase
    {
        private readonly string[] texts = new string[] {
            "It was the best of times, it was the worst of times.",
            "this is a new dawn, an era to follow the previous era. It can not be said to start anew.",
            "It was the best of times, it was the worst of times.",
            "this is a new dawn, an era to follow the previous era.",
        };

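        // Pre-tokenized versions of `texts`, with the original casing and word
        // order preserved, for exercising the overloads that accept token lists.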
        private readonly string[][] tokenized_texts = new string[][] {
            new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
            new string[] {"this","is","a","new","dawn","an","era","to","follow","the","previous","era","It","can","not","be","said","to","start","anew"},
            new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
            new string[] {"this","is","a","new","dawn","an","era","to","follow","the","previous","era"},
        };

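        // Expected round-trip output: `texts` lowercased with punctuation
        // stripped, which is what sequences_to_texts should reproduce.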
        private readonly string[] processed_texts = new string[] {
            "it was the best of times it was the worst of times",
            "this is a new dawn an era to follow the previous era it can not be said to start anew",
            "it was the best of times it was the worst of times",
            "this is a new dawn an era to follow the previous era",
        };

        private const string OOV = "<OOV>";

        [TestMethod]
        public void TokenizeWithNoOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(23, tokenizer.word_index.Count);

            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["dawn"]);
            Assert.AreEqual(16, tokenizer.word_index["follow"]);
        }
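
        // The asserted indices are consistent with word ids being assigned in
        // order of first occurrence, starting at 1 ("it" = 1, "was" = 2,
        // "the" = 3, ..., "worst" = 7, ..., "dawn" = 12, ..., "follow" = 16).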

        [TestMethod]
        public void TokenizeWithNoOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(23, tokenizer.word_index.Count);

            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["dawn"]);
            Assert.AreEqual(16, tokenizer.word_index["follow"]);
        }

        [TestMethod]
        public void TokenizeWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(24, tokenizer.word_index.Count);

            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["dawn"]);
            Assert.AreEqual(17, tokenizer.word_index["follow"]);
        }
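
        // With an OOV token configured, <OOV> claims index 1 and every other
        // word shifts up by one relative to TokenizeWithNoOOV
        // (worst 7 -> 8, dawn 12 -> 13, follow 16 -> 17).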

        [TestMethod]
        public void TokenizeWithOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(24, tokenizer.word_index.Count);

            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["dawn"]);
            Assert.AreEqual(17, tokenizer.word_index["follow"]);
        }

        [TestMethod]
        public void TokenizeTextsToSequences()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
        }
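
        // A hand-rolled decoding sketch, for reference only: `DecodeSequence`
        // is a hypothetical helper, and it assumes word_index behaves as an
        // IDictionary<string, int>. It inverts the index and drops ids with no
        // known word; sequences_to_texts, exercised below, is the built-in way.
        private static string DecodeSequence(IDictionary<string, int> wordIndex, IEnumerable<int> sequence)
        {
            // Invert word -> id into id -> word, then map each id back.
            var indexWord = wordIndex.ToDictionary(kv => kv.Value, kv => kv.Key);
            return string.Join(" ", sequence.Where(indexWord.ContainsKey).Select(id => indexWord[id]));
        }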

        [TestMethod]
        public void TokenizeTextsToSequences_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn1()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn2()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn3()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);

            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }
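
        // Tkn1-Tkn3 mix raw strings and pre-tokenized lists across fitting and
        // encoding; the round trip matches in every combination because both
        // input forms normalize to the same lowercase vocabulary.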

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);

            // No vocabulary limit was set, so the OOV id must never appear
            // when encoding the same texts the tokenizer was fitted on.
            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOVPresent()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);

            var oov_count = 0;
            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    if (tokenizer.word_index[OOV] == sequences[i][j])
                        oov_count += 1;

            Assert.AreEqual(5, oov_count);
        }
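
        // The expected count of 5 is consistent with first-occurrence indexing:
        // with <OOV> at 1, the words "not", "be", "said", "start" and "anew"
        // land at ranks 20-24, are cut off by num_words: 20, and each occurs
        // exactly once in the corpus.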

        [TestMethod]
        public void PadSequencesWithDefaults()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences);

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(20, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 17].GetInt32());
            for (var i = 0; i < 8; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["previous"], padded[1, 10].GetInt32());
            for (var i = 0; i < 20; i++)
                Assert.AreNotEqual(0, padded[1, i].GetInt32());
        }
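
        // With no arguments, pad_sequences pre-pads with zeros to the longest
        // sequence (20 here): the 12-token rows gain 8 leading zeros and the
        // 20-token row is left untouched.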

        [TestMethod]
        public void PadSequencesPrePaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15);

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(15, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32());
            for (var i = 0; i < 3; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["previous"], padded[1, 5].GetInt32());
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(0, padded[1, i].GetInt32());
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post");

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(15, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
            for (var i = 12; i < 15; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["previous"], padded[1, 10].GetInt32());
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(0, padded[1, i].GetInt32());
        }
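
        // A speculative extra sketch (the method name is hypothetical): it
        // assumes the pre-padding default demonstrated above also applies when
        // maxlen exceeds the longest sequence, so every row gains leading zeros.
        [TestMethod]
        public void PadSequencesMaxlenExceedsLongest()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 25);

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(25, padded.shape[1]);

            // The longest row had 20 tokens, so its first 5 entries are padding.
            for (var i = 0; i < 5; i++)
                Assert.AreEqual(0, padded[1, i].GetInt32());
        }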
    }
}