
Adding more implementations to Tokenizer -- seq->text, pre-tokenized texts, etc.

Added unit tests.
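
A sketch of the new surface area (names as used in the unit tests below; a minimal illustration, not part of the commit):

    var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: "<OOV>");

    // Fitting on texts that the caller has already tokenized.
    tokenizer.fit_on_texts(new[] {
        new[] { "it", "was", "the", "best", "of", "times" },
        new[] { "this", "is", "a", "new", "dawn" },
    });

    // Encoding, and the new sequence -> text direction.
    var sequences = tokenizer.texts_to_sequences(new[] { "It was the best of times." });
    var texts = tokenizer.sequences_to_texts(sequences);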
pull/756/head
Niklas Gustafsson 4 years ago
parent
commit
0413e5197a
2 changed files with 351 additions and 25 deletions
  1. +166 -20  src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs
  2. +185 -5   test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs

+166 -20  src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs

@@ -9,6 +9,12 @@ using System.Text;

namespace Tensorflow.Keras.Text
{
/// <summary>
/// Text tokenization API.
/// This class allows you to vectorize a text corpus, by turning each text into either a sequence of integers
/// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
/// each token could be binary, based on word count, based on tf-idf...
/// </summary>
public class Tokenizer
{
private readonly int num_words;
@@ -47,6 +53,11 @@ namespace Tensorflow.Keras.Text
this.analyzer = analyzer;
}

/// <summary>
/// Updates internal vocabulary based on a list of texts.
/// </summary>
/// <param name="texts"></param>
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
public void fit_on_texts(IEnumerable<string> texts)
{
foreach (var text in texts)
@@ -81,16 +92,16 @@ namespace Tensorflow.Keras.Text
var wcounts = word_counts.AsEnumerable().ToList();
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));

var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
sorted_voc.AddRange(word_counts.Select(kv => kv.Key));

if (num_words > 0 - 1)
{
sorted_voc = sorted_voc.Take<string>((oov_token == null) ? num_words : num_words + 1).ToList();
}

word_index = new Dictionary<string, int>(sorted_voc.Count);
index_word = new Dictionary<int, string>(sorted_voc.Count);
index_docs = new Dictionary<int, int>(word_docs.Count);

for (int i = 0; i < sorted_voc.Count; i++)
@@ -109,25 +120,98 @@ namespace Tensorflow.Keras.Text
}
}

public void fit_on_texts(IEnumerable<IList<string>> texts)
{
foreach (var seq in texts)
{
foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
{
var count = 0;
word_counts.TryGetValue(w, out count);
word_counts[w] = count + 1;
}

foreach (var w in new HashSet<string>(word_counts.Keys))
{
var count = 0;
word_docs.TryGetValue(w, out count);
word_docs[w] = count + 1;
}
}

var wcounts = word_counts.AsEnumerable().ToList();
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));

var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
sorted_voc.AddRange(word_counts.Select(kv => kv.Key));

if (num_words > 0 - 1)
{
sorted_voc = sorted_voc.Take<string>((oov_token == null) ? num_words : num_words + 1).ToList();
}

word_index = new Dictionary<string, int>(sorted_voc.Count);
index_word = new Dictionary<int, string>(sorted_voc.Count);
index_docs = new Dictionary<int, int>(word_docs.Count);

for (int i = 0; i < sorted_voc.Count; i++)
{
word_index.Add(sorted_voc[i], i + 1);
index_word.Add(i + 1, sorted_voc[i]);
}

foreach (var kv in word_docs)
{
var idx = -1;
if (word_index.TryGetValue(kv.Key, out idx))
{
index_docs.Add(idx, kv.Value);
}
}
}

/// <summary>
/// Updates internal vocabulary based on a list of sequences.
/// </summary>
/// <param name="sequences"></param>
/// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
public void fit_on_sequences(IEnumerable<int[]> sequences)
{
throw new NotImplementedException("fit_on_sequences");
}

/// <summary>
/// Transforms each string in texts to a sequence of integers.
/// </summary>
/// <param name="texts"></param>
/// <returns></returns>
/// <remarks>Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account.</remarks>
public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
{
return texts_to_sequences_generator(texts).ToArray();
}

/// <summary>
/// Transforms each sequence of tokens in texts to a sequence of integers.
/// </summary>
/// <param name="texts">A sequence of pre-tokenized texts to encode.</param>
/// <returns>A list of integer sequences.</returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
public IList<int[]> texts_to_sequences(IEnumerable<IList<string>> texts)
{
return texts_to_sequences_generator(texts).ToArray();
}

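/// <summary>
/// Transforms each string in texts to a sequence of integers, lazily.
/// </summary>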
public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
{
int oov_index = -1;
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

return texts.Select(text =>
{
IEnumerable<string> seq = null;

if (char_level)
{
throw new NotImplementedException("char_level == true");
}
@@ -135,39 +219,101 @@ namespace Tensorflow.Keras.Text
{
seq = analyzer(lower ? text.ToLower() : text);
}

return ConvertToSequence(oov_index, seq).ToArray();
});
}

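/// <summary>
/// Maps each token to its integer index. In-vocabulary tokens whose index is
/// num_words or greater, and unknown tokens, are emitted as oov_index when an
/// OOV token is configured, and dropped otherwise.
/// </summary>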
private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
{
var vect = new List<int>();
foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
{
var i = -1;
if (word_index.TryGetValue(w, out i))
{
if (num_words != -1 && i >= num_words)
{
if (oov_index != -1)
{
vect.Add(oov_index);
}
}
else
{
vect.Add(i);
}
}
else if(oov_index != -1)
{
vect.Add(oov_index);
}
}

return vect;
}

public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IList<string>> texts)
{
int oov_index = -1;
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
}

/// <summary>
/// Transforms each sequence into a text string.
/// </summary>
/// <param name="sequences">A sequence of integer-encoded sequences to decode.</param>
/// <returns>A list of texts (strings).</returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
public IList<string> sequences_to_texts(IEnumerable<int[]> sequences)
{
return sequences_to_texts_generator(sequences).ToArray();
}

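/// <summary>
/// Transforms each sequence into a text string, lazily. Indices without a
/// known word are emitted as the OOV token when one is configured.
/// </summary>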
public IEnumerable<string> sequences_to_texts_generator(IEnumerable<int[]> sequences)
{
int oov_index = -1;
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

return sequences.Select(seq =>
{

var bldr = new StringBuilder();
for (var i = 0; i < seq.Length; i++)
{
if (i > 0) bldr.Append(' ');

string word = null;
if (index_word.TryGetValue(seq[i], out word))
{
if (num_words != -1 && i >= num_words)
{
if (oov_index != -1)
{
bldr.Append(oov_token);
}
}
else
{
bldr.Append(word);
}
}
else if (oov_index != -1)
{
bldr.Append(oov_token);
}
}

return bldr.ToString();
});
}

/// <summary>
/// Converts a list of sequences into a Numpy matrix.
/// </summary>
/// <param name="sequences"></param>
/// <returns></returns>
public NDArray sequences_to_matrix(IEnumerable<int[]> sequences)
{
throw new NotImplementedException("sequences_to_matrix");


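The unit tests below exercise the round trip texts -> sequences -> texts. The round trip is lossy in a predictable way: decoding yields the analyzer-processed form of the input (lowercased, punctuation stripped, space-joined), not the original string. A minimal sketch of the expectation, assuming the default analyzer:

    var tokenizer = keras.preprocessing.text.Tokenizer();
    tokenizer.fit_on_texts(new[] { "It was the best of times." });
    var seqs = tokenizer.texts_to_sequences(new[] { "It was the best of times." });
    var round_trip = tokenizer.sequences_to_texts(seqs);
    // round_trip[0] == "it was the best of times"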
+185 -5  test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs

@@ -19,12 +19,27 @@ namespace TensorFlowNET.Keras.UnitTest
"It was the best of times, it was the worst of times.",
"this is a new dawn, an era to follow the previous era.",
};

private readonly string[][] tokenized_texts = new string[][] {
new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
new string[] {"this","is","a","new","dawn","an","era","to","follow","the","previous","era","It","can","not","be","said","to","start","anew" },
new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
new string[] {"this","is","a","new","dawn","an","era","to","follow","the","previous","era" },
};

private readonly string[] processed_texts = new string[] {
"it was the best of times it was the worst of times",
"this is a new dawn an era to follow the previous era it can not be said to start anew",
"it was the best of times it was the worst of times",
"this is a new dawn an era to follow the previous era",
};

private const string OOV = "<OOV>";

[TestMethod]
public void TokenizeWithNoOOV()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

Assert.AreEqual(23, tokenizer.word_index.Count);
@@ -34,10 +49,24 @@ namespace TensorFlowNET.Keras.UnitTest
Assert.AreEqual(16, tokenizer.word_index["follow"]);
}

[TestMethod]
public void TokenizeWithNoOOV_Tkn()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

Assert.AreEqual(23, tokenizer.word_index.Count);

Assert.AreEqual(7, tokenizer.word_index["worst"]);
Assert.AreEqual(12, tokenizer.word_index["dawn"]);
Assert.AreEqual(16, tokenizer.word_index["follow"]);
}

[TestMethod]
public void TokenizeWithOOV()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

Assert.AreEqual(24, tokenizer.word_index.Count);
@@ -48,10 +77,161 @@ namespace TensorFlowNET.Keras.UnitTest
Assert.AreEqual(17, tokenizer.word_index["follow"]);
}

[TestMethod]
public void TokenizeWithOOV_Tkn()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

Assert.AreEqual(24, tokenizer.word_index.Count);

Assert.AreEqual(1, tokenizer.word_index[OOV]);
Assert.AreEqual(8, tokenizer.word_index["worst"]);
Assert.AreEqual(13, tokenizer.word_index["dawn"]);
Assert.AreEqual(17, tokenizer.word_index["follow"]);
}

[TestMethod]
public void TokenizeTextsToSequences()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
}

[TestMethod]
public void TokenizeTextsToSequences_Tkn()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

var sequences = tokenizer.texts_to_sequences(tokenized_texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack_Tkn1()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

// Use the list version, where the tokenization has already been done.
var sequences = tokenizer.texts_to_sequences(tokenized_texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack_Tkn2()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack_Tkn3()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

// Use the list version, where the tokenization has already been done.
var sequences = tokenizer.texts_to_sequences(tokenized_texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesWithOOV()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);

for (var i = 0; i < sequences.Count; i++)
for (var j = 0; j < sequences[i].Length; j++)
Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
}

[TestMethod]
public void TokenizeTextsToSequencesWithOOVPresent()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);

var oov_count = 0;
for (var i = 0; i < sequences.Count; i++)
for (var j = 0; j < sequences[i].Length; j++)
if (tokenizer.word_index[OOV] == sequences[i][j])
oov_count += 1;

Assert.AreEqual(5, oov_count);
}
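
// The expected count of 5 follows from the num_words cap: with num_words = 20,
// any token whose index falls at or beyond the cap, or outside the retained
// vocabulary, is emitted as the OOV index, and across the four texts that
// happens five times.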

[TestMethod]
public void PadSequencesWithDefaults()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
@@ -74,7 +254,7 @@ namespace TensorFlowNET.Keras.UnitTest
[TestMethod]
public void PadSequencesPrePaddingTrunc()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
@@ -97,7 +277,7 @@ namespace TensorFlowNET.Keras.UnitTest
[TestMethod]
public void PadSequencesPostPaddingTrunc()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);


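The padding tests (truncated in this view) feed the resulting sequences to the Keras-style pad_sequences API. A minimal sketch of the expected behavior, assuming keras.preprocessing.sequence.pad_sequences mirrors the Keras signature:

    var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15,
        padding: "pre", truncating: "pre");
    // One row per sequence, each of length 15: shorter sequences are zero-padded
    // at the front, longer ones are truncated from the front.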