From e1b1fafb0a1a11e10b5cfb7d806a60589caa5ea6 Mon Sep 17 00:00:00 2001 From: Niklas Gustafsson Date: Fri, 19 Feb 2021 09:06:58 -0800 Subject: [PATCH] Adding subset of text preprocessing Keras APis and unit tests. --- .../Preprocessings/Preprocessing.cs | 4 + .../Preprocessings/Tokenizer.cs | 444 ++++++++++++++++++ src/TensorFlowNET.Keras/Sequence.cs | 29 +- src/TensorFlowNET.Keras/TextApi.cs | 35 ++ .../PreprocessingTests.cs | 413 ++++++++++++++++ 5 files changed, 917 insertions(+), 8 deletions(-) create mode 100644 src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs create mode 100644 src/TensorFlowNET.Keras/TextApi.cs create mode 100644 test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs diff --git a/src/TensorFlowNET.Keras/Preprocessings/Preprocessing.cs b/src/TensorFlowNET.Keras/Preprocessings/Preprocessing.cs index 34aeb211..994a36d6 100644 --- a/src/TensorFlowNET.Keras/Preprocessings/Preprocessing.cs +++ b/src/TensorFlowNET.Keras/Preprocessings/Preprocessing.cs @@ -10,6 +10,10 @@ namespace Tensorflow.Keras public Sequence sequence => new Sequence(); public DatasetUtils dataset_utils => new DatasetUtils(); + public TextApi text => _text; + + private static TextApi _text = new TextApi(); + public TextVectorization TextVectorization(Func standardize = null, string split = "whitespace", int max_tokens = -1, diff --git a/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs b/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs new file mode 100644 index 00000000..29cbec8e --- /dev/null +++ b/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs @@ -0,0 +1,444 @@ +using NumSharp; +using Serilog.Debugging; +using System; +using System.Collections.Generic; +using System.Collections.Specialized; +using System.Data.SqlTypes; +using System.Linq; +using System.Net.Sockets; +using System.Text; + +namespace Tensorflow.Keras.Text +{ + /// + /// Text tokenization API. + /// This class allows to vectorize a text corpus, by turning each text into either a sequence of integers + /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for + /// each token could be binary, based on word count, based on tf-idf... + /// + /// + /// This code is a fairly straight port of the Python code for Keras text preprocessing found at: + /// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py + /// + public class Tokenizer + { + private readonly int num_words; + private readonly string filters; + private readonly bool lower; + private readonly char split; + private readonly bool char_level; + private readonly string oov_token; + private readonly Func> analyzer; + + private int document_count = 0; + + private Dictionary word_docs = new Dictionary(); + private Dictionary word_counts = new Dictionary(); + + public Dictionary word_index = null; + public Dictionary index_word = null; + + private Dictionary index_docs = null; + + public Tokenizer( + int num_words = -1, + string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n", + bool lower = true, + char split = ' ', + bool char_level = false, + string oov_token = null, + Func> analyzer = null) + { + this.num_words = num_words; + this.filters = filters; + this.lower = lower; + this.split = split; + this.char_level = char_level; + this.oov_token = oov_token; + this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split); + } + + /// + /// Updates internal vocabulary based on a list of texts. 
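+        /// Indices assigned to words start at 1 (index 0 is never assigned); when an oov_token is configured it receives index 1.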
+ /// + /// A list of strings, each containing one or more tokens. + /// Required before using texts_to_sequences or texts_to_matrix. + public void fit_on_texts(IEnumerable texts) + { + foreach (var text in texts) + { + IEnumerable seq = null; + + document_count += 1; + if (char_level) + { + throw new NotImplementedException("char_level == true"); + } + else + { + seq = analyzer(lower ? text.ToLower() : text); + } + + foreach (var w in seq) + { + var count = 0; + word_counts.TryGetValue(w, out count); + word_counts[w] = count + 1; + } + + foreach (var w in new HashSet(seq)) + { + var count = 0; + word_docs.TryGetValue(w, out count); + word_docs[w] = count + 1; + } + } + + var wcounts = word_counts.AsEnumerable().ToList(); + wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order. + + var sorted_voc = (oov_token == null) ? new List() : new List() { oov_token }; + sorted_voc.AddRange(word_counts.Select(kv => kv.Key)); + + if (num_words > 0 - 1) + { + sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList(); + } + + word_index = new Dictionary(sorted_voc.Count); + index_word = new Dictionary(sorted_voc.Count); + index_docs = new Dictionary(word_docs.Count); + + for (int i = 0; i < sorted_voc.Count; i++) + { + word_index.Add(sorted_voc[i], i + 1); + index_word.Add(i + 1, sorted_voc[i]); + } + + foreach (var kv in word_docs) + { + var idx = -1; + if (word_index.TryGetValue(kv.Key, out idx)) + { + index_docs.Add(idx, kv.Value); + } + } + } + + /// + /// Updates internal vocabulary based on a list of texts. + /// + /// A list of list of strings, each containing one token. + /// Required before using texts_to_sequences or texts_to_matrix. + public void fit_on_texts(IEnumerable> texts) + { + foreach (var seq in texts) + { + foreach (var w in seq.Select(s => lower ? s.ToLower() : s)) + { + var count = 0; + word_counts.TryGetValue(w, out count); + word_counts[w] = count + 1; + } + + foreach (var w in new HashSet(word_counts.Keys)) + { + var count = 0; + word_docs.TryGetValue(w, out count); + word_docs[w] = count + 1; + } + } + + var wcounts = word_counts.AsEnumerable().ToList(); + wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); + + var sorted_voc = (oov_token == null) ? new List() : new List() { oov_token }; + sorted_voc.AddRange(word_counts.Select(kv => kv.Key)); + + if (num_words > 0 - 1) + { + sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList(); + } + + word_index = new Dictionary(sorted_voc.Count); + index_word = new Dictionary(sorted_voc.Count); + index_docs = new Dictionary(word_docs.Count); + + for (int i = 0; i < sorted_voc.Count; i++) + { + word_index.Add(sorted_voc[i], i + 1); + index_word.Add(i + 1, sorted_voc[i]); + } + + foreach (var kv in word_docs) + { + var idx = -1; + if (word_index.TryGetValue(kv.Key, out idx)) + { + index_docs.Add(idx, kv.Value); + } + } + } + + /// + /// Updates internal vocabulary based on a list of sequences. + /// + /// + /// Required before using sequences_to_matrix (if fit_on_texts was never called). + public void fit_on_sequences(IEnumerable sequences) + { + throw new NotImplementedException("fit_on_sequences"); + } + + /// + /// Transforms each string in texts to a sequence of integers. + /// + /// + /// + /// Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account. 
+ public IList texts_to_sequences(IEnumerable texts) + { + return texts_to_sequences_generator(texts).ToArray(); + } + + /// + /// Transforms each token in texts to a sequence of integers. + /// + /// + /// + /// Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account. + public IList texts_to_sequences(IEnumerable> texts) + { + return texts_to_sequences_generator(texts).ToArray(); + } + + public IEnumerable texts_to_sequences_generator(IEnumerable texts) + { + int oov_index = -1; + var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index); + + return texts.Select(text => + { + IEnumerable seq = null; + + if (char_level) + { + throw new NotImplementedException("char_level == true"); + } + else + { + seq = analyzer(lower ? text.ToLower() : text); + } + + return ConvertToSequence(oov_index, seq).ToArray(); + }); + } + + public IEnumerable texts_to_sequences_generator(IEnumerable> texts) + { + int oov_index = -1; + var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index); + return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray()); + } + + private List ConvertToSequence(int oov_index, IEnumerable seq) + { + var vect = new List(); + foreach (var w in seq.Select(s => lower ? s.ToLower() : s)) + { + var i = -1; + if (word_index.TryGetValue(w, out i)) + { + if (num_words != -1 && i >= num_words) + { + if (oov_index != -1) + { + vect.Add(oov_index); + } + } + else + { + vect.Add(i); + } + } + else if (oov_index != -1) + { + vect.Add(oov_index); + } + } + + return vect; + } + + /// + /// Transforms each sequence into a list of text. + /// + /// + /// A list of texts(strings) + /// Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account. + public IList sequences_to_texts(IEnumerable sequences) + { + return sequences_to_texts_generator(sequences).ToArray(); + } + + public IEnumerable sequences_to_texts_generator(IEnumerable> sequences) + { + int oov_index = -1; + var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index); + + return sequences.Select(seq => + { + + var bldr = new StringBuilder(); + for (var i = 0; i < seq.Count; i++) + { + if (i > 0) bldr.Append(' '); + + string word = null; + if (index_word.TryGetValue(seq[i], out word)) + { + if (num_words != -1 && i >= num_words) + { + if (oov_index != -1) + { + bldr.Append(oov_token); + } + } + else + { + bldr.Append(word); + } + } + else if (oov_index != -1) + { + bldr.Append(oov_token); + } + } + + return bldr.ToString(); + }); + } + + /// + /// Convert a list of texts to a Numpy matrix. + /// + /// A sequence of strings containing one or more tokens. + /// One of "binary", "count", "tfidf", "freq". + /// + public NDArray texts_to_matrix(IEnumerable texts, string mode = "binary") + { + return sequences_to_matrix(texts_to_sequences(texts), mode); + } + + /// + /// Convert a list of texts to a Numpy matrix. + /// + /// A sequence of lists of strings, each containing one token. + /// One of "binary", "count", "tfidf", "freq". + /// + public NDArray texts_to_matrix(IEnumerable> texts, string mode = "binary") + { + return sequences_to_matrix(texts_to_sequences(texts), mode); + } + + /// + /// Converts a list of sequences into a Numpy matrix. + /// + /// A sequence of lists of integers, encoding tokens. + /// One of "binary", "count", "tfidf", "freq". 
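+        /// Mode summary (describing the implementation below): "binary" writes 1 for each token present in a sequence,
+        /// "count" writes the raw token count, "freq" writes the count divided by the sequence length, and "tfidf" writes
+        /// (1 + ln(count)) * ln(1 + document_count / (1 + number of documents containing the token)).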
+ /// + public NDArray sequences_to_matrix(IEnumerable> sequences, string mode = "binary") + { + if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}"); + var word_count = 0; + + if (num_words == -1) + { + if (word_index != null) + { + word_count = word_index.Count + 1; + } + else + { + throw new InvalidOperationException("Specifya dimension ('num_words' arugment), or fit on some text data first."); + } + } + else + { + word_count = num_words; + } + + if (mode == "tfidf" && this.document_count == 0) + { + throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode."); + } + + var x = np.zeros(sequences.Count(), word_count); + + for (int i = 0; i < sequences.Count(); i++) + { + var seq = sequences.ElementAt(i); + if (seq == null || seq.Count == 0) + continue; + + var counts = new Dictionary(); + + var seq_length = seq.Count; + + foreach (var j in seq) + { + if (j >= word_count) + continue; + var count = 0; + counts.TryGetValue(j, out count); + counts[j] = count + 1; + } + + if (mode == "count") + { + foreach (var kv in counts) + { + var j = kv.Key; + var c = kv.Value; + x[i, j] = c; + } + } + else if (mode == "freq") + { + foreach (var kv in counts) + { + var j = kv.Key; + var c = kv.Value; + x[i, j] = ((double)c) / seq_length; + } + } + else if (mode == "binary") + { + foreach (var kv in counts) + { + var j = kv.Key; + var c = kv.Value; + x[i, j] = 1; + } + } + else if (mode == "tfidf") + { + foreach (var kv in counts) + { + var j = kv.Key; + var c = kv.Value; + var id = 0; + var _ = index_docs.TryGetValue(j, out id); + var tf = 1 + np.log(c); + var idf = np.log(1 + document_count / (1 + id)); + x[i, j] = tf * idf; + } + } + } + + return x; + } + + private string[] modes = new string[] { "binary", "count", "tfidf", "freq" }; + } +} diff --git a/src/TensorFlowNET.Keras/Sequence.cs b/src/TensorFlowNET.Keras/Sequence.cs index a428a568..9f503aee 100644 --- a/src/TensorFlowNET.Keras/Sequence.cs +++ b/src/TensorFlowNET.Keras/Sequence.cs @@ -15,7 +15,9 @@ ******************************************************************************/ using NumSharp; +using NumSharp.Utilities; using System; +using System.Collections.Generic; using System.Linq; namespace Tensorflow.Keras @@ -34,14 +36,18 @@ namespace Tensorflow.Keras /// String, 'pre' or 'post' /// Float or String, padding value. /// - public NDArray pad_sequences(NDArray sequences, + public NDArray pad_sequences(IEnumerable sequences, int? 
maxlen = null, string dtype = "int32", string padding = "pre", string truncating = "pre", object value = null) { - int[] length = new int[sequences.size]; + if (value != null) throw new NotImplementedException("padding with a specific value."); + if (padding != "pre" && padding != "post") throw new InvalidArgumentError("padding must be 'pre' or 'post'."); + if (truncating != "pre" && truncating != "post") throw new InvalidArgumentError("truncating must be 'pre' or 'post'."); + + var length = sequences.Select(s => s.Length); if (maxlen == null) maxlen = length.Max(); @@ -49,19 +55,26 @@ namespace Tensorflow.Keras if (value == null) value = 0f; - var nd = new NDArray(np.int32, new Shape(sequences.size, maxlen.Value)); -#pragma warning disable CS0162 // Unreachable code detected + var type = getNPType(dtype); + var nd = new NDArray(type, new Shape(length.Count(), maxlen.Value), true); + for (int i = 0; i < nd.shape[0]; i++) -#pragma warning restore CS0162 // Unreachable code detected { - switch (sequences[i]) + var s = sequences.ElementAt(i); + if (s.Length > maxlen.Value) { - default: - throw new NotImplementedException("pad_sequences"); + s = (truncating == "pre") ? s.Slice(s.Length - maxlen.Value, s.Length) : s.Slice(0, maxlen.Value); } + var sliceString = (padding == "pre") ? $"{i},{maxlen - s.Length}:" : $"{i},:{s.Length}"; + nd[sliceString] = np.array(s); } return nd; } + + private Type getNPType(string typeName) + { + return System.Type.GetType("NumSharp.np,NumSharp").GetField(typeName).GetValue(null) as Type; + } } } diff --git a/src/TensorFlowNET.Keras/TextApi.cs b/src/TensorFlowNET.Keras/TextApi.cs new file mode 100644 index 00000000..8ce8d685 --- /dev/null +++ b/src/TensorFlowNET.Keras/TextApi.cs @@ -0,0 +1,35 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using Tensorflow.Keras.Text; + +namespace Tensorflow.Keras +{ + public class TextApi + { + public Tensorflow.Keras.Text.Tokenizer Tokenizer( + int num_words = -1, + string filters = DefaultFilter, + bool lower = true, + char split = ' ', + bool char_level = false, + string oov_token = null, + Func> analyzer = null) + { + return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer); + } + + public static IEnumerable text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ') + { + if (lower) + { + text = text.ToLower(); + } + var newText = new String(text.Where(c => !filters.Contains(c)).ToArray()); + return newText.Split(split); + } + + private const string DefaultFilter = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n"; + } +} diff --git a/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs b/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs new file mode 100644 index 00000000..10340063 --- /dev/null +++ b/test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs @@ -0,0 +1,413 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Linq; +using System.Collections.Generic; +using System.Text; +using NumSharp; +using static Tensorflow.KerasApi; +using Tensorflow; +using Tensorflow.Keras.Datasets; +using Microsoft.Extensions.DependencyInjection; + +namespace TensorFlowNET.Keras.UnitTest +{ + [TestClass] + public class PreprocessingTests : EagerModeTestBase + { + private readonly string[] texts = new string[] { + "It was the best of times, it was the worst of times.", + "Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank 
you very much.", + "It was the best of times, it was the worst of times.", + "Mr and Mrs Dursley of number four, Privet Drive.", + }; + + private readonly string[][] tokenized_texts = new string[][] { + new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"}, + new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"}, + new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"}, + new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"}, + }; + + private readonly string[] processed_texts = new string[] { + "it was the best of times it was the worst of times", + "mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much", + "it was the best of times it was the worst of times", + "mr and mrs dursley of number four privet drive", + }; + + private const string OOV = ""; + + [TestMethod] + public void TokenizeWithNoOOV() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + Assert.AreEqual(27, tokenizer.word_index.Count); + + Assert.AreEqual(7, tokenizer.word_index["worst"]); + Assert.AreEqual(12, tokenizer.word_index["number"]); + Assert.AreEqual(16, tokenizer.word_index["were"]); + } + + [TestMethod] + public void TokenizeWithNoOOV_Tkn() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + // Use the list version, where the tokenization has already been done. + tokenizer.fit_on_texts(tokenized_texts); + + Assert.AreEqual(27, tokenizer.word_index.Count); + + Assert.AreEqual(7, tokenizer.word_index["worst"]); + Assert.AreEqual(12, tokenizer.word_index["number"]); + Assert.AreEqual(16, tokenizer.word_index["were"]); + } + + [TestMethod] + public void TokenizeWithOOV() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + Assert.AreEqual(28, tokenizer.word_index.Count); + + Assert.AreEqual(1, tokenizer.word_index[OOV]); + Assert.AreEqual(8, tokenizer.word_index["worst"]); + Assert.AreEqual(13, tokenizer.word_index["number"]); + Assert.AreEqual(17, tokenizer.word_index["were"]); + } + + [TestMethod] + public void TokenizeWithOOV_Tkn() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + // Use the list version, where the tokenization has already been done. + tokenizer.fit_on_texts(tokenized_texts); + + Assert.AreEqual(28, tokenizer.word_index.Count); + + Assert.AreEqual(1, tokenizer.word_index[OOV]); + Assert.AreEqual(8, tokenizer.word_index["worst"]); + Assert.AreEqual(13, tokenizer.word_index["number"]); + Assert.AreEqual(17, tokenizer.word_index["were"]); + } + + [TestMethod] + public void TokenizeTextsToSequences() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + Assert.AreEqual(4, sequences.Count); + + Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); + Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); + } + + [TestMethod] + public void TokenizeTextsToSequences_Tkn() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + // Use the list version, where the tokenization has already been done. 
+ tokenizer.fit_on_texts(tokenized_texts); + + var sequences = tokenizer.texts_to_sequences(tokenized_texts); + Assert.AreEqual(4, sequences.Count); + + Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); + Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); + } + + [TestMethod] + public void TokenizeTextsToSequencesAndBack() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + Assert.AreEqual(4, sequences.Count); + + var processed = tokenizer.sequences_to_texts(sequences); + + Assert.AreEqual(4, processed.Count); + + for (var i = 0; i < processed.Count; i++) + Assert.AreEqual(processed_texts[i], processed[i]); + } + + [TestMethod] + public void TokenizeTextsToSequencesAndBack_Tkn1() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + // Use the list version, where the tokenization has already been done. + tokenizer.fit_on_texts(tokenized_texts); + + // Use the list version, where the tokenization has already been done. + var sequences = tokenizer.texts_to_sequences(tokenized_texts); + Assert.AreEqual(4, sequences.Count); + + var processed = tokenizer.sequences_to_texts(sequences); + + Assert.AreEqual(4, processed.Count); + + for (var i = 0; i < processed.Count; i++) + Assert.AreEqual(processed_texts[i], processed[i]); + } + + [TestMethod] + public void TokenizeTextsToSequencesAndBack_Tkn2() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + // Use the list version, where the tokenization has already been done. + tokenizer.fit_on_texts(tokenized_texts); + + var sequences = tokenizer.texts_to_sequences(texts); + Assert.AreEqual(4, sequences.Count); + + var processed = tokenizer.sequences_to_texts(sequences); + + Assert.AreEqual(4, processed.Count); + + for (var i = 0; i < processed.Count; i++) + Assert.AreEqual(processed_texts[i], processed[i]); + } + + [TestMethod] + public void TokenizeTextsToSequencesAndBack_Tkn3() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + // Use the list version, where the tokenization has already been done. 
+ var sequences = tokenizer.texts_to_sequences(tokenized_texts); + Assert.AreEqual(4, sequences.Count); + + var processed = tokenizer.sequences_to_texts(sequences); + + Assert.AreEqual(4, processed.Count); + + for (var i = 0; i < processed.Count; i++) + Assert.AreEqual(processed_texts[i], processed[i]); + } + [TestMethod] + public void TokenizeTextsToSequencesWithOOV() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + Assert.AreEqual(4, sequences.Count); + + Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); + Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); + + for (var i = 0; i < sequences.Count; i++) + for (var j = 0; j < sequences[i].Length; j++) + Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]); + } + + [TestMethod] + public void TokenizeTextsToSequencesWithOOVPresent() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words:20); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + Assert.AreEqual(4, sequences.Count); + + Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); + Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); + + var oov_count = 0; + for (var i = 0; i < sequences.Count; i++) + for (var j = 0; j < sequences[i].Length; j++) + if (tokenizer.word_index[OOV] == sequences[i][j]) + oov_count += 1; + + Assert.AreEqual(9, oov_count); + } + + [TestMethod] + public void PadSequencesWithDefaults() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + var padded = keras.preprocessing.sequence.pad_sequences(sequences); + + Assert.AreEqual(4, padded.shape[0]); + Assert.AreEqual(22, padded.shape[1]); + + Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32()); + for (var i = 0; i < 8; i++) + Assert.AreEqual(0, padded[0, i].GetInt32()); + Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32()); + for (var i = 0; i < 20; i++) + Assert.AreNotEqual(0, padded[1, i].GetInt32()); + } + + [TestMethod] + public void PadSequencesPrePaddingTrunc() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + var padded = keras.preprocessing.sequence.pad_sequences(sequences,maxlen:15); + + Assert.AreEqual(4, padded.shape[0]); + Assert.AreEqual(15, padded.shape[1]); + + Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32()); + for (var i = 0; i < 3; i++) + Assert.AreEqual(0, padded[0, i].GetInt32()); + Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 3].GetInt32()); + for (var i = 0; i < 15; i++) + Assert.AreNotEqual(0, padded[1, i].GetInt32()); + } + + [TestMethod] + public void PadSequencesPrePaddingTrunc_Larger() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45); + + Assert.AreEqual(4, padded.shape[0]); + Assert.AreEqual(45, padded.shape[1]); + + Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 42].GetInt32()); + for (var i = 0; i < 33; i++) + Assert.AreEqual(0, padded[0, i].GetInt32()); + Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 33].GetInt32()); + } + + 
[TestMethod] + public void PadSequencesPostPaddingTrunc() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post"); + + Assert.AreEqual(4, padded.shape[0]); + Assert.AreEqual(15, padded.shape[1]); + + Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32()); + for (var i = 12; i < 15; i++) + Assert.AreEqual(0, padded[0, i].GetInt32()); + Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32()); + for (var i = 0; i < 15; i++) + Assert.AreNotEqual(0, padded[1, i].GetInt32()); + } + + [TestMethod] + public void PadSequencesPostPaddingTrunc_Larger() + { + var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); + tokenizer.fit_on_texts(texts); + + var sequences = tokenizer.texts_to_sequences(texts); + var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post"); + + Assert.AreEqual(4, padded.shape[0]); + Assert.AreEqual(45, padded.shape[1]); + + Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32()); + for (var i = 32; i < 45; i++) + Assert.AreEqual(0, padded[0, i].GetInt32()); + Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32()); + } + + [TestMethod] + public void TextToMatrixBinary() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + Assert.AreEqual(27, tokenizer.word_index.Count); + + var matrix = tokenizer.texts_to_matrix(texts); + + Assert.AreEqual(texts.Length, matrix.shape[0]); + + CompareLists(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray()); + CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray()); + } + + [TestMethod] + public void TextToMatrixCount() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + Assert.AreEqual(27, tokenizer.word_index.Count); + + var matrix = tokenizer.texts_to_matrix(texts, mode:"count"); + + Assert.AreEqual(texts.Length, matrix.shape[0]); + + CompareLists(new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray()); + CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray()); + } + + [TestMethod] + public void TextToMatrixFrequency() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + Assert.AreEqual(27, tokenizer.word_index.Count); + + var matrix = tokenizer.texts_to_matrix(texts, mode: "freq"); + + Assert.AreEqual(texts.Length, matrix.shape[0]); + + double t12 = 2.0 / 12.0; + double o12 = 1.0 / 12.0; + double t22 = 2.0 / 22.0; + double o22 = 1.0 / 22.0; + + CompareLists(new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray()); + CompareLists(new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray()); + } + + [TestMethod] + public void TextToMatrixTDIDF() + { + var tokenizer = keras.preprocessing.text.Tokenizer(); + tokenizer.fit_on_texts(texts); + + Assert.AreEqual(27, tokenizer.word_index.Count); + + 
var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");
+
+            Assert.AreEqual(texts.Length, matrix.shape[0]);
+
+            double t1 = 1.1736001944781467;
+            double t2 = 0.69314718055994529;
+            double t3 = 1.860112299086919;
+            double t4 = 1.0986122886681098;
+            double t5 = 0.69314718055994529;
+
+            CompareLists(new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
+            CompareLists(new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>());
+        }
+
+        private void CompareLists(IList<double> expected, IList<double> actual)
+        {
+            Assert.AreEqual(expected.Count, actual.Count);
+            for (var i = 0; i < expected.Count; i++)
+            {
+                Assert.AreEqual(expected[i], actual[i]);
+            }
+        }
+
+    }
+}
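A minimal end-to-end usage sketch of the APIs added in this patch, following the same calls exercised by the unit tests above (keras.preprocessing.text.Tokenizer, texts_to_sequences, pad_sequences, texts_to_matrix); the sample corpus and the "<OOV>" token are illustrative placeholders, not part of the patch:

    using System;
    using static Tensorflow.KerasApi;

    class TextPreprocessingExample
    {
        static void Main()
        {
            var corpus = new[] { "it was the best of times", "it was the worst of times" };

            // Build the vocabulary; out-of-vocabulary words map to the "<OOV>" index (1).
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: "<OOV>");
            tokenizer.fit_on_texts(corpus);

            // Encode each text as word indices, then pad/truncate to a fixed length.
            var sequences = tokenizer.texts_to_sequences(corpus);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 10, padding: "post");
            Console.WriteLine($"{padded.shape[0]} x {padded.shape[1]}");   // 2 x 10

            // Or vectorize directly into a (texts x vocabulary) matrix.
            var matrix = tokenizer.texts_to_matrix(corpus, mode: "count");
            Console.WriteLine($"{matrix.shape[0]} x {matrix.shape[1]}");
        }
    }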