From 0413e5197ae9ac04b8ebdf6c58b1da9d09bb02da Mon Sep 17 00:00:00 2001
From: Niklas Gustafsson
Date: Fri, 29 Jan 2021 10:47:10 -0800
Subject: [PATCH] Adding more implementations to Tokenizer -- seq->text,
 pre-tokenized texts, etc. Added unit tests.

---
 .../Preprocessings/Tokenizer.cs               | 186 +++++++++++++++--
 .../PreprocessingTests.cs                     | 190 +++++++++++++++++-
 2 files changed, 351 insertions(+), 25 deletions(-)

diff --git a/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs b/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs
index d3bfeaac..aaca1cb9 100644
--- a/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs
+++ b/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs
@@ -9,6 +9,12 @@
 using System.Text;
 namespace Tensorflow.Keras.Text
 {
+    /// <summary>
+    /// Text tokenization API.
+    /// This class allows you to vectorize a text corpus, by turning each text into either a sequence of integers
+    /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
+    /// each token could be binary, based on word count, based on tf-idf...
+    /// </summary>
     public class Tokenizer
     {
         private readonly int num_words;
@@ -47,6 +53,11 @@ namespace Tensorflow.Keras.Text
             this.analyzer = analyzer;
         }
 
+        /// <summary>
+        /// Updates internal vocabulary based on a list of texts.
+        /// </summary>
+        /// <param name="texts"></param>
+        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
         public void fit_on_texts(IEnumerable<string> texts)
         {
             foreach (var text in texts)
@@ -81,16 +92,16 @@ namespace Tensorflow.Keras.Text
             var wcounts = word_counts.AsEnumerable().ToList();
             wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));
 
-            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>(){oov_token};
+            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
             sorted_voc.AddRange(word_counts.Select(kv => kv.Key));
 
-            if (num_words > 0 -1)
+            if (num_words > -1)
             {
                 sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
             }
 
             word_index = new Dictionary<string, int>(sorted_voc.Count);
-            index_word = new Dictionary<string, int>(sorted_voc.Count);
+            index_word = new Dictionary<int, string>(sorted_voc.Count);
             index_docs = new Dictionary<int, int>(word_docs.Count);
 
             for (int i = 0; i < sorted_voc.Count; i++)
@@ -109,25 +120,98 @@ namespace Tensorflow.Keras.Text
             }
         }
 
+        public void fit_on_texts(IEnumerable<IEnumerable<string>> texts)
+        {
+            foreach (var seq in texts)
+            {
+                foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
+                {
+                    var count = 0;
+                    word_counts.TryGetValue(w, out count);
+                    word_counts[w] = count + 1;
+                }
+
+                foreach (var w in new HashSet<string>(seq.Select(s => lower ? s.ToLower() : s)))
+                {
+                    var count = 0;
+                    word_docs.TryGetValue(w, out count);
+                    word_docs[w] = count + 1;
+                }
+            }
+
+            var wcounts = word_counts.AsEnumerable().ToList();
+            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));
+
+            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
+            sorted_voc.AddRange(word_counts.Select(kv => kv.Key));
+
+            if (num_words > -1)
+            {
+                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
+            }
+
+            word_index = new Dictionary<string, int>(sorted_voc.Count);
+            index_word = new Dictionary<int, string>(sorted_voc.Count);
+            index_docs = new Dictionary<int, int>(word_docs.Count);
+
+            for (int i = 0; i < sorted_voc.Count; i++)
+            {
+                word_index.Add(sorted_voc[i], i + 1);
+                index_word.Add(i + 1, sorted_voc[i]);
+            }
+
+            foreach (var kv in word_docs)
+            {
+                var idx = -1;
+                if (word_index.TryGetValue(kv.Key, out idx))
+                {
+                    index_docs.Add(idx, kv.Value);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Updates internal vocabulary based on a list of sequences.
+        /// </summary>
+        /// <param name="sequences"></param>
+        /// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
         public void fit_on_sequences(IEnumerable<int[]> sequences)
         {
             throw new NotImplementedException("fit_on_sequences");
         }
 
+        /// <summary>
+        /// Transforms each string in texts to a sequence of integers.
+        /// </summary>
+        /// <param name="texts"></param>
+        /// <returns></returns>
+        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
         public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
         {
             return texts_to_sequences_generator(texts).ToArray();
         }
+
+        /// <summary>
+        /// Transforms each sequence of tokens in texts to a sequence of integers.
+        /// </summary>
+        /// <param name="texts"></param>
+        /// <returns></returns>
+        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
+        public IList<int[]> texts_to_sequences(IEnumerable<IEnumerable<string>> texts)
+        {
+            return texts_to_sequences_generator(texts).ToArray();
+        }
+
         public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
         {
             int oov_index = -1;
             var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
-            return texts.Select(text => {
-
+            return texts.Select(text =>
+            {
                 IEnumerable<string> seq = null;
-                if (char_level)
+                if (char_level)
                 {
                     throw new NotImplementedException("char_level == true");
                 }
@@ -135,39 +219,101 @@ namespace Tensorflow.Keras.Text
                 {
                     seq = analyzer(lower ? text.ToLower() : text);
                 }
-                var vect = new List<int>();
-                foreach (var w in seq)
+
+                return ConvertToSequence(oov_index, seq).ToArray();
+            });
+        }
+
+        private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
+        {
+            var vect = new List<int>();
+            foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
+            {
+                var i = -1;
+                if (word_index.TryGetValue(w, out i))
+                {
+                    if (num_words != -1 && i >= num_words)
+                    {
+                        if (oov_index != -1)
+                        {
+                            vect.Add(oov_index);
+                        }
+                    }
+                    else
+                    {
+                        vect.Add(i);
+                    }
+                }
+                else if (oov_index != -1)
+                {
+                    vect.Add(oov_index);
+                }
+            }
+
+            return vect;
+        }
+
+        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IEnumerable<string>> texts)
+        {
+            int oov_index = -1;
+            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
+            return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
+        }
+
+        /// <summary>
+        /// Transforms each sequence into a text string.
+        /// </summary>
+        /// <param name="sequences"></param>
+        /// <returns>A list of texts (strings)</returns>
+        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
+        public IList<string> sequences_to_texts(IEnumerable<int[]> sequences)
+        {
+            return sequences_to_texts_generator(sequences).ToArray();
+        }
+
+        public IEnumerable<string> sequences_to_texts_generator(IEnumerable<int[]> sequences)
+        {
+            int oov_index = -1;
+            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
+
+            return sequences.Select(seq =>
+            {
+
+                var bldr = new StringBuilder();
+                for (var i = 0; i < seq.Length; i++)
                 {
-                    var i = -1;
-                    if (word_index.TryGetValue(w, out i))
+                    if (i > 0) bldr.Append(' ');
+
+                    string word = null;
+                    if (index_word.TryGetValue(seq[i], out word))
                     {
-                        if (num_words != -1 && i >= num_words)
+                        if (num_words != -1 && seq[i] >= num_words)
                         {
                             if (oov_index != -1)
                             {
-                                vect.Add(oov_index);
+                                bldr.Append(oov_token);
                             }
                         }
                         else
                         {
-                            vect.Add(i);
+                            bldr.Append(word);
                         }
                     }
-                    else
+                    else if (oov_index != -1)
                     {
-                        vect.Add(oov_index);
+                        bldr.Append(oov_token);
                     }
                 }
-                return vect.ToArray();
+                return bldr.ToString();
             });
         }
 
-        public IEnumerable<string> sequences_to_texts(IEnumerable<int[]> sequences)
-        {
-            throw new NotImplementedException("sequences_to_texts");
-        }
-
+        /// <summary>
+        /// Converts a list of sequences into a NumPy matrix.
+        /// </summary>
+        /// <param name="sequences"></param>
+        /// <returns></returns>
         public NDArray sequences_to_matrix(IEnumerable<int[]> sequences)
         {
             throw new NotImplementedException("sequences_to_matrix");
diff --git a/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs b/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs
index 0bbb9a8b..ebde87fa 100644
--- a/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs
+++ b/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs
@@ -19,12 +19,27 @@ namespace TensorFlowNET.Keras.UnitTest
             "It was the best of times, it was the worst of times.",
             "this is a new dawn, an era to follow the previous era.",
         };
+
+        private readonly string[][] tokenized_texts = new string[][] {
+            new string[] { "It", "was", "the", "best", "of", "times", "it", "was", "the", "worst", "of", "times" },
+            new string[] { "this", "is", "a", "new", "dawn", "an", "era", "to", "follow", "the", "previous", "era", "It", "can", "not", "be", "said", "to", "start", "anew" },
+            new string[] { "It", "was", "the", "best", "of", "times", "it", "was", "the", "worst", "of", "times" },
+            new string[] { "this", "is", "a", "new", "dawn", "an", "era", "to", "follow", "the", "previous", "era" },
+        };
+
+        private readonly string[] processed_texts = new string[] {
+            "it was the best of times it was the worst of times",
+            "this is a new dawn an era to follow the previous era it can not be said to start anew",
+            "it was the best of times it was the worst of times",
+            "this is a new dawn an era to follow the previous era",
+        };
+
         private const string OOV = "<OOV>";
 
         [TestMethod]
         public void TokenizeWithNoOOV()
         {
-            var tokenizer = keras.preprocessing.text.Tokenizer(lower: true);
+            var tokenizer = keras.preprocessing.text.Tokenizer();
             tokenizer.fit_on_texts(texts);
 
             Assert.AreEqual(23, tokenizer.word_index.Count);
@@ -34,10 +49,24 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(16, tokenizer.word_index["follow"]);
         }
 
+        [TestMethod]
+        public void TokenizeWithNoOOV_Tkn()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            // Use the list version, where the tokenization has already been done.
+            tokenizer.fit_on_texts(tokenized_texts);
+
+            Assert.AreEqual(23, tokenizer.word_index.Count);
+
+            Assert.AreEqual(7, tokenizer.word_index["worst"]);
+            Assert.AreEqual(12, tokenizer.word_index["dawn"]);
+            Assert.AreEqual(16, tokenizer.word_index["follow"]);
+        }
+
         [TestMethod]
         public void TokenizeWithOOV()
         {
-            var tokenizer = keras.preprocessing.text.Tokenizer(lower: true, oov_token: OOV);
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
             tokenizer.fit_on_texts(texts);
 
             Assert.AreEqual(24, tokenizer.word_index.Count);
@@ -48,10 +77,161 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(17, tokenizer.word_index["follow"]);
         }
 
+        [TestMethod]
+        public void TokenizeWithOOV_Tkn()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
+            // Use the list version, where the tokenization has already been done.
+            tokenizer.fit_on_texts(tokenized_texts);
+
+            Assert.AreEqual(24, tokenizer.word_index.Count);
+
+            Assert.AreEqual(1, tokenizer.word_index[OOV]);
+            Assert.AreEqual(8, tokenizer.word_index["worst"]);
+            Assert.AreEqual(13, tokenizer.word_index["dawn"]);
+            Assert.AreEqual(17, tokenizer.word_index["follow"]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequences()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            var sequences = tokenizer.texts_to_sequences(texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
+            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequences_Tkn()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            // Use the list version, where the tokenization has already been done.
+            tokenizer.fit_on_texts(tokenized_texts);
+
+            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
+            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequencesAndBack()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            var sequences = tokenizer.texts_to_sequences(texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            var processed = tokenizer.sequences_to_texts(sequences);
+
+            Assert.AreEqual(4, processed.Count);
+
+            for (var i = 0; i < processed.Count; i++)
+                Assert.AreEqual(processed_texts[i], processed[i]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequencesAndBack_Tkn1()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            // Use the list version, where the tokenization has already been done.
+            tokenizer.fit_on_texts(tokenized_texts);
+
+            // Use the list version, where the tokenization has already been done.
+            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            var processed = tokenizer.sequences_to_texts(sequences);
+
+            Assert.AreEqual(4, processed.Count);
+
+            for (var i = 0; i < processed.Count; i++)
+                Assert.AreEqual(processed_texts[i], processed[i]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequencesAndBack_Tkn2()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            // Use the list version, where the tokenization has already been done.
+            tokenizer.fit_on_texts(tokenized_texts);
+
+            var sequences = tokenizer.texts_to_sequences(texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            var processed = tokenizer.sequences_to_texts(sequences);
+
+            Assert.AreEqual(4, processed.Count);
+
+            for (var i = 0; i < processed.Count; i++)
+                Assert.AreEqual(processed_texts[i], processed[i]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequencesAndBack_Tkn3()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            // Use the list version, where the tokenization has already been done.
+            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            var processed = tokenizer.sequences_to_texts(sequences);
+
+            Assert.AreEqual(4, processed.Count);
+
+            for (var i = 0; i < processed.Count; i++)
+                Assert.AreEqual(processed_texts[i], processed[i]);
+        }
+        [TestMethod]
+        public void TokenizeTextsToSequencesWithOOV()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
+            tokenizer.fit_on_texts(texts);
+
+            var sequences = tokenizer.texts_to_sequences(texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
+            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+
+            for (var i = 0; i < sequences.Count; i++)
+                for (var j = 0; j < sequences[i].Length; j++)
+                    Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
+        }
+
+        [TestMethod]
+        public void TokenizeTextsToSequencesWithOOVPresent()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
+            tokenizer.fit_on_texts(texts);
+
+            var sequences = tokenizer.texts_to_sequences(texts);
+            Assert.AreEqual(4, sequences.Count);
+
+            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
+            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+
+            var oov_count = 0;
+            for (var i = 0; i < sequences.Count; i++)
+                for (var j = 0; j < sequences[i].Length; j++)
+                    if (tokenizer.word_index[OOV] == sequences[i][j])
+                        oov_count += 1;
+
+            Assert.AreEqual(5, oov_count);
+        }
+
         [TestMethod]
         public void PadSequencesWithDefaults()
         {
-            var tokenizer = keras.preprocessing.text.Tokenizer(lower: true, oov_token: OOV);
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
             tokenizer.fit_on_texts(texts);
 
             var sequences = tokenizer.texts_to_sequences(texts);
@@ -74,7 +254,7 @@ namespace TensorFlowNET.Keras.UnitTest
 
         [TestMethod]
         public void PadSequencesPrePaddingTrunc()
         {
-            var tokenizer = keras.preprocessing.text.Tokenizer(lower: true, oov_token: OOV);
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
             tokenizer.fit_on_texts(texts);
 
             var sequences = tokenizer.texts_to_sequences(texts);
@@ -97,7 +277,7 @@ namespace TensorFlowNET.Keras.UnitTest
 
         [TestMethod]
         public void PadSequencesPostPaddingTrunc()
         {
-            var tokenizer = keras.preprocessing.text.Tokenizer(lower: true, oov_token: OOV);
+            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
             tokenizer.fit_on_texts(texts);
 
             var sequences = tokenizer.texts_to_sequences(texts);
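
Reviewer note, not part of the patch: a minimal round-trip usage sketch of the
string-based path this change completes. It assumes the API surface the unit
tests above rely on (the keras entry point exposed by
"using static Tensorflow.KerasApi;" in TensorFlow.NET); everything else
mirrors the patch itself.

    using static Tensorflow.KerasApi;

    var texts = new string[] {
        "It was the best of times, it was the worst of times.",
        "this is a new dawn, an era to follow the previous era.",
    };

    // fit_on_texts builds word_index/index_word from the raw strings.
    var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: "<OOV>");
    tokenizer.fit_on_texts(texts);

    // Each text becomes an int[] of token ids...
    var sequences = tokenizer.texts_to_sequences(texts);
    // ...and the new sequences_to_texts restores the lower-cased, filtered text,
    // e.g. "it was the best of times it was the worst of times".
    var restored = tokenizer.sequences_to_texts(sequences);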
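A second sketch, same assumptions, for the new pre-tokenized overloads added
here: fit_on_texts and texts_to_sequences accept token lists directly (a
string[][] satisfies IEnumerable<IEnumerable<string>> by covariance), and a
num_words cap makes out-of-vocabulary or below-cutoff tokens map to the
oov_token index, as the TokenizeTextsToSequencesWithOOVPresent test exercises
for the string path.

    var tokenized = new string[][] {
        new string[] { "It", "was", "the", "best", "of", "times" },
        new string[] { "this", "is", "a", "new", "dawn" },
    };

    var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: "<OOV>", num_words: 20);
    // No analyzer/splitting runs here; tokens are only lower-cased.
    tokenizer.fit_on_texts(tokenized);

    // Tokens outside the top num_words-1 entries (or unseen at fit time)
    // come back as the oov_token's index.
    var sequences = tokenizer.texts_to_sequences(tokenized);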