
Adding a subset of text preprocessing Keras APIs and unit tests.

tags/v0.40-tf2.4-tstring
Niklas Gustafsson, Esther Hu · 4 years ago
parent commit e1b1fafb0a
5 changed files with 917 additions and 8 deletions:
  1. src/TensorFlowNET.Keras/Preprocessings/Preprocessing.cs (+4, -0)
  2. src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs (+444, -0)
  3. src/TensorFlowNET.Keras/Sequence.cs (+21, -8)
  4. src/TensorFlowNET.Keras/TextApi.cs (+35, -0)
  5. test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs (+413, -0)

src/TensorFlowNET.Keras/Preprocessings/Preprocessing.cs (+4, -0)

@@ -10,6 +10,10 @@ namespace Tensorflow.Keras
public Sequence sequence => new Sequence();
public DatasetUtils dataset_utils => new DatasetUtils();

public TextApi text => _text;

private static TextApi _text = new TextApi();

public TextVectorization TextVectorization(Func<Tensor, Tensor> standardize = null,
string split = "whitespace",
int max_tokens = -1,


src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs (+444, -0)

@@ -0,0 +1,444 @@
using NumSharp;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Tensorflow.Keras.Text
{
/// <summary>
/// Text tokenization API.
/// This class allows you to vectorize a text corpus, by turning each text into either a sequence of integers
/// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
/// each token could be binary, based on word count, based on tf-idf...
/// </summary>
/// <remarks>
/// This code is a fairly straight port of the Python code for Keras text preprocessing found at:
/// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
/// </remarks>
public class Tokenizer
{
private readonly int num_words;
private readonly string filters;
private readonly bool lower;
private readonly char split;
private readonly bool char_level;
private readonly string oov_token;
private readonly Func<string, IEnumerable<string>> analyzer;

private int document_count = 0;

private Dictionary<string, int> word_docs = new Dictionary<string, int>();
private Dictionary<string, int> word_counts = new Dictionary<string, int>();

public Dictionary<string, int> word_index = null;
public Dictionary<int, string> index_word = null;

private Dictionary<int, int> index_docs = null;

public Tokenizer(
int num_words = -1,
string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
bool lower = true,
char split = ' ',
bool char_level = false,
string oov_token = null,
Func<string, IEnumerable<string>> analyzer = null)
{
this.num_words = num_words;
this.filters = filters;
this.lower = lower;
this.split = split;
this.char_level = char_level;
this.oov_token = oov_token;
this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split);
}

/// <summary>
/// Updates internal vocabulary based on a list of texts.
/// </summary>
/// <param name="texts">A list of strings, each containing one or more tokens.</param>
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
public void fit_on_texts(IEnumerable<string> texts)
{
foreach (var text in texts)
{
IEnumerable<string> seq = null;

document_count += 1;
if (char_level)
{
throw new NotImplementedException("char_level == true");
}
else
{
seq = analyzer(lower ? text.ToLower() : text);
}

foreach (var w in seq)
{
var count = 0;
word_counts.TryGetValue(w, out count);
word_counts[w] = count + 1;
}

foreach (var w in new HashSet<string>(seq))
{
var count = 0;
word_docs.TryGetValue(w, out count);
word_docs[w] = count + 1;
}
}

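// Note: the sorted wcounts list is not used below, so the vocabulary keeps first-seen order rather than
// the descending-frequency order of the Python reference; the unit tests in this commit assert that order.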
var wcounts = word_counts.AsEnumerable().ToList();
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order.

var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
sorted_voc.AddRange(word_counts.Select(kv => kv.Key));

if (num_words > -1)
{
sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
}

word_index = new Dictionary<string, int>(sorted_voc.Count);
index_word = new Dictionary<int, string>(sorted_voc.Count);
index_docs = new Dictionary<int, int>(word_docs.Count);

for (int i = 0; i < sorted_voc.Count; i++)
{
word_index.Add(sorted_voc[i], i + 1);
index_word.Add(i + 1, sorted_voc[i]);
}

foreach (var kv in word_docs)
{
var idx = -1;
if (word_index.TryGetValue(kv.Key, out idx))
{
index_docs.Add(idx, kv.Value);
}
}
}

/// <summary>
/// Updates internal vocabulary based on a list of texts.
/// </summary>
/// <param name="texts">A list of list of strings, each containing one token.</param>
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
public void fit_on_texts(IEnumerable<IEnumerable<string>> texts)
{
foreach (var seq in texts)
{
foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
{
var count = 0;
word_counts.TryGetValue(w, out count);
word_counts[w] = count + 1;
}

foreach (var w in new HashSet<string>(seq.Select(s => lower ? s.ToLower() : s)))
{
var count = 0;
word_docs.TryGetValue(w, out count);
word_docs[w] = count + 1;
}
}

var wcounts = word_counts.AsEnumerable().ToList();
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));

var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
sorted_voc.AddRange(word_counts.Select(kv => kv.Key));

if (num_words > -1)
{
sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
}

word_index = new Dictionary<string, int>(sorted_voc.Count);
index_word = new Dictionary<int, string>(sorted_voc.Count);
index_docs = new Dictionary<int, int>(word_docs.Count);

for (int i = 0; i < sorted_voc.Count; i++)
{
word_index.Add(sorted_voc[i], i + 1);
index_word.Add(i + 1, sorted_voc[i]);
}

foreach (var kv in word_docs)
{
var idx = -1;
if (word_index.TryGetValue(kv.Key, out idx))
{
index_docs.Add(idx, kv.Value);
}
}
}

/// <summary>
/// Updates internal vocabulary based on a list of sequences.
/// </summary>
/// <param name="sequences"></param>
/// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
public void fit_on_sequences(IEnumerable<int[]> sequences)
{
throw new NotImplementedException("fit_on_sequences");
}

/// <summary>
/// Transforms each string in texts to a sequence of integers.
/// </summary>
/// <param name="texts"></param>
/// <returns></returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
{
return texts_to_sequences_generator(texts).ToArray();
}

/// <summary>
/// Transforms each list of tokens in texts to a sequence of integers.
/// </summary>
/// <param name="texts"></param>
/// <returns></returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
public IList<int[]> texts_to_sequences(IEnumerable<IEnumerable<string>> texts)
{
return texts_to_sequences_generator(texts).ToArray();
}

public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
{
int oov_index = -1;
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

return texts.Select(text =>
{
IEnumerable<string> seq = null;

if (char_level)
{
throw new NotImplementedException("char_level == true");
}
else
{
seq = analyzer(lower ? text.ToLower() : text);
}

return ConvertToSequence(oov_index, seq).ToArray();
});
}

public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IEnumerable<string>> texts)
{
int oov_index = -1;
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
}

private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
{
var vect = new List<int>();
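// Known tokens ranked at or above the num_words cutoff, and tokens missing from the vocabulary,
// map to the OOV index when one is configured; otherwise they are skipped.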
foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
{
var i = -1;
if (word_index.TryGetValue(w, out i))
{
if (num_words != -1 && i >= num_words)
{
if (oov_index != -1)
{
vect.Add(oov_index);
}
}
else
{
vect.Add(i);
}
}
else if (oov_index != -1)
{
vect.Add(oov_index);
}
}

return vect;
}

/// <summary>
/// Transforms each sequence into a text string.
/// </summary>
/// <param name="sequences"></param>
/// <returns>A list of texts (strings)</returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
public IList<string> sequences_to_texts(IEnumerable<int[]> sequences)
{
return sequences_to_texts_generator(sequences).ToArray();
}

public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences)
{
int oov_index = -1;
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

return sequences.Select(seq =>
{

var bldr = new StringBuilder();
for (var i = 0; i < seq.Count; i++)
{
if (i > 0) bldr.Append(' ');

string word = null;
if (index_word.TryGetValue(seq[i], out word))
{
if (num_words != -1 && seq[i] >= num_words)
{
if (oov_index != -1)
{
bldr.Append(oov_token);
}
}
else
{
bldr.Append(word);
}
}
else if (oov_index != -1)
{
bldr.Append(oov_token);
}
}

return bldr.ToString();
});
}

/// <summary>
/// Convert a list of texts to a NumSharp matrix.
/// </summary>
/// <param name="texts">A sequence of strings containing one or more tokens.</param>
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
/// <returns></returns>
public NDArray texts_to_matrix(IEnumerable<string> texts, string mode = "binary")
{
return sequences_to_matrix(texts_to_sequences(texts), mode);
}

/// <summary>
/// Convert a list of texts to a NumSharp matrix.
/// </summary>
/// <param name="texts">A sequence of lists of strings, each containing one token.</param>
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
/// <returns></returns>
public NDArray texts_to_matrix(IEnumerable<IList<string>> texts, string mode = "binary")
{
return sequences_to_matrix(texts_to_sequences(texts), mode);
}

/// <summary>
/// Converts a list of sequences into a NumSharp matrix.
/// </summary>
/// <param name="sequences">A sequence of lists of integers, encoding tokens.</param>
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
/// <returns></returns>
public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences, string mode = "binary")
{
if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}");
var word_count = 0;

if (num_words == -1)
{
if (word_index != null)
{
word_count = word_index.Count + 1;
}
else
{
throw new InvalidOperationException("Specifya dimension ('num_words' arugment), or fit on some text data first.");
}
}
else
{
word_count = num_words;
}

if (mode == "tfidf" && this.document_count == 0)
{
throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode.");
}

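// One row per sequence, one column per word index; column 0 stays unused because word indices start at 1.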
var seqs = sequences.ToList(); // materialize once to avoid re-enumerating per row
var x = np.zeros(seqs.Count, word_count);

for (int i = 0; i < seqs.Count; i++)
{
var seq = seqs[i];
if (seq == null || seq.Count == 0)
continue;

var counts = new Dictionary<int, int>();

var seq_length = seq.Count;

foreach (var j in seq)
{
if (j >= word_count)
continue;
var count = 0;
counts.TryGetValue(j, out count);
counts[j] = count + 1;
}

if (mode == "count")
{
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value;
x[i, j] = c;
}
}
else if (mode == "freq")
{
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value;
x[i, j] = ((double)c) / seq_length;
}
}
else if (mode == "binary")
{
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value;
x[i, j] = 1;
}
}
else if (mode == "tfidf")
{
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value;
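// tf-idf as ported: tf = 1 + ln(count), idf = ln(1 + document_count / (1 + doc frequency)).
// The integer division in the idf term zeroes the weight of words that appear in every
// document; the tfidf unit test below pins that behavior.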
var id = 0;
var _ = index_docs.TryGetValue(j, out id);
var tf = 1 + np.log(c);
var idf = np.log(1 + document_count / (1 + id));
x[i, j] = tf * idf;
}
}
}

return x;
}

private string[] modes = new string[] { "binary", "count", "tfidf", "freq" };
}
}
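For reference, a minimal usage sketch of the Tokenizer above (this assumes "using static Tensorflow.KerasApi;", as in the unit tests below):

var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: "<OOV>");
tokenizer.fit_on_texts(new[] { "It was the best of times." });

// "worst" was never seen during fitting, so it maps to the OOV index (1).
var sequences = tokenizer.texts_to_sequences(new[] { "It was the worst of times." });
var restored = tokenizer.sequences_to_texts(sequences); // "it was the <OOV> of times"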

src/TensorFlowNET.Keras/Sequence.cs (+21, -8)

@@ -15,7 +15,9 @@
******************************************************************************/

using NumSharp;
using NumSharp.Utilities;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Tensorflow.Keras
@@ -34,14 +36,18 @@ namespace Tensorflow.Keras
/// <param name="truncating">String, 'pre' or 'post'</param>
/// <param name="value">Float or String, padding value.</param>
/// <returns></returns>
public NDArray pad_sequences(IEnumerable<int[]> sequences,
int? maxlen = null,
string dtype = "int32",
string padding = "pre",
string truncating = "pre",
object value = null)
{
if (value != null) throw new NotImplementedException("padding with a specific value.");
if (padding != "pre" && padding != "post") throw new InvalidArgumentError("padding must be 'pre' or 'post'.");
if (truncating != "pre" && truncating != "post") throw new InvalidArgumentError("truncating must be 'pre' or 'post'.");

var length = sequences.Select(s => s.Length);

if (maxlen == null)
maxlen = length.Max();
@@ -49,19 +55,26 @@ namespace Tensorflow.Keras
if (value == null)
value = 0f;

var type = getNPType(dtype);
var nd = new NDArray(type, new Shape(length.Count(), maxlen.Value), true);

for (int i = 0; i < nd.shape[0]; i++)
{
var s = sequences.ElementAt(i);
if (s.Length > maxlen.Value)
{
s = (truncating == "pre") ? s.Slice(s.Length - maxlen.Value, s.Length) : s.Slice(0, maxlen.Value);
}
var sliceString = (padding == "pre") ? $"{i},{maxlen - s.Length}:" : $"{i},:{s.Length}";
nd[sliceString] = np.array(s);
}

return nd;
}

private Type getNPType(string typeName)
{
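// Resolve the NumSharp dtype (e.g. np.int32) matching the dtype string, via reflection on the np class.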
return System.Type.GetType("NumSharp.np,NumSharp").GetField(typeName).GetValue(null) as Type;
}
}
}
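Reusing the sequences from the Tokenizer sketch above, a minimal sketch of the reworked pad_sequences:

var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 10);
// padded.shape == (n, 10): with the default "pre" settings, shorter rows are
// left-padded with zeros and longer rows keep only their last 10 entries.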

src/TensorFlowNET.Keras/TextApi.cs (+35, -0)

@@ -0,0 +1,35 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Tensorflow.Keras.Text;

namespace Tensorflow.Keras
{
public class TextApi
{
public Tensorflow.Keras.Text.Tokenizer Tokenizer(
int num_words = -1,
string filters = DefaultFilter,
bool lower = true,
char split = ' ',
bool char_level = false,
string oov_token = null,
Func<string, IEnumerable<string>> analyzer = null)
{
return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer);
}

public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ')
{
if (lower)
{
text = text.ToLower();
}
// As in the Python reference, filter characters are replaced by the split character
// (not removed), and empty tokens are dropped.
var newText = new String(text.Select(c => filters.Contains(c) ? split : c).ToArray());
return newText.Split(new[] { split }, StringSplitOptions.RemoveEmptyEntries);
}

private const string DefaultFilter = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n";
}
}
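The helper can also be used on its own; a minimal sketch:

var words = TextApi.text_to_word_sequence("It was the best of times, it was the worst of times.");
// -> it, was, the, best, of, times, it, was, the, worst, of, times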

test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs (+413, -0)

@@ -0,0 +1,413 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Linq;
using System.Collections.Generic;
using System.Text;
using NumSharp;
using static Tensorflow.KerasApi;
using Tensorflow;

namespace TensorFlowNET.Keras.UnitTest
{
[TestClass]
public class PreprocessingTests : EagerModeTestBase
{
private readonly string[] texts = new string[] {
"It was the best of times, it was the worst of times.",
"Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
"It was the best of times, it was the worst of times.",
"Mr and Mrs Dursley of number four, Privet Drive.",
};

private readonly string[][] tokenized_texts = new string[][] {
new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"},
new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"},
};

private readonly string[] processed_texts = new string[] {
"it was the best of times it was the worst of times",
"mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
"it was the best of times it was the worst of times",
"mr and mrs dursley of number four privet drive",
};

private const string OOV = "<OOV>";

[TestMethod]
public void TokenizeWithNoOOV()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

Assert.AreEqual(27, tokenizer.word_index.Count);

Assert.AreEqual(7, tokenizer.word_index["worst"]);
Assert.AreEqual(12, tokenizer.word_index["number"]);
Assert.AreEqual(16, tokenizer.word_index["were"]);
}

[TestMethod]
public void TokenizeWithNoOOV_Tkn()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

Assert.AreEqual(27, tokenizer.word_index.Count);

Assert.AreEqual(7, tokenizer.word_index["worst"]);
Assert.AreEqual(12, tokenizer.word_index["number"]);
Assert.AreEqual(16, tokenizer.word_index["were"]);
}

[TestMethod]
public void TokenizeWithOOV()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

Assert.AreEqual(28, tokenizer.word_index.Count);

Assert.AreEqual(1, tokenizer.word_index[OOV]);
Assert.AreEqual(8, tokenizer.word_index["worst"]);
Assert.AreEqual(13, tokenizer.word_index["number"]);
Assert.AreEqual(17, tokenizer.word_index["were"]);
}

[TestMethod]
public void TokenizeWithOOV_Tkn()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

Assert.AreEqual(28, tokenizer.word_index.Count);

Assert.AreEqual(1, tokenizer.word_index[OOV]);
Assert.AreEqual(8, tokenizer.word_index["worst"]);
Assert.AreEqual(13, tokenizer.word_index["number"]);
Assert.AreEqual(17, tokenizer.word_index["were"]);
}

[TestMethod]
public void TokenizeTextsToSequences()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
}

[TestMethod]
public void TokenizeTextsToSequences_Tkn()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

var sequences = tokenizer.texts_to_sequences(tokenized_texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack_Tkn1()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

// Use the list version, where the tokenization has already been done.
var sequences = tokenizer.texts_to_sequences(tokenized_texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack_Tkn2()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
// Use the list version, where the tokenization has already been done.
tokenizer.fit_on_texts(tokenized_texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesAndBack_Tkn3()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

// Use the list version, where the tokenization has already been done.
var sequences = tokenizer.texts_to_sequences(tokenized_texts);
Assert.AreEqual(4, sequences.Count);

var processed = tokenizer.sequences_to_texts(sequences);

Assert.AreEqual(4, processed.Count);

for (var i = 0; i < processed.Count; i++)
Assert.AreEqual(processed_texts[i], processed[i]);
}

[TestMethod]
public void TokenizeTextsToSequencesWithOOV()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

for (var i = 0; i < sequences.Count; i++)
for (var j = 0; j < sequences[i].Length; j++)
Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
}

[TestMethod]
public void TokenizeTextsToSequencesWithOOVPresent()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words:20);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
Assert.AreEqual(4, sequences.Count);

Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

var oov_count = 0;
for (var i = 0; i < sequences.Count; i++)
for (var j = 0; j < sequences[i].Length; j++)
if (tokenizer.word_index[OOV] == sequences[i][j])
oov_count += 1;

Assert.AreEqual(9, oov_count);
}

[TestMethod]
public void PadSequencesWithDefaults()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
var padded = keras.preprocessing.sequence.pad_sequences(sequences);

Assert.AreEqual(4, padded.shape[0]);
Assert.AreEqual(22, padded.shape[1]);

Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32());
for (var i = 0; i < 8; i++)
Assert.AreEqual(0, padded[0, i].GetInt32());
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
for (var i = 0; i < 20; i++)
Assert.AreNotEqual(0, padded[1, i].GetInt32());
}

[TestMethod]
public void PadSequencesPrePaddingTrunc()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
var padded = keras.preprocessing.sequence.pad_sequences(sequences,maxlen:15);

Assert.AreEqual(4, padded.shape[0]);
Assert.AreEqual(15, padded.shape[1]);

Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32());
for (var i = 0; i < 3; i++)
Assert.AreEqual(0, padded[0, i].GetInt32());
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 3].GetInt32());
for (var i = 0; i < 15; i++)
Assert.AreNotEqual(0, padded[1, i].GetInt32());
}

[TestMethod]
public void PadSequencesPrePaddingTrunc_Larger()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45);

Assert.AreEqual(4, padded.shape[0]);
Assert.AreEqual(45, padded.shape[1]);

Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 42].GetInt32());
for (var i = 0; i < 33; i++)
Assert.AreEqual(0, padded[0, i].GetInt32());
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 33].GetInt32());
}

[TestMethod]
public void PadSequencesPostPaddingTrunc()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post");

Assert.AreEqual(4, padded.shape[0]);
Assert.AreEqual(15, padded.shape[1]);

Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
for (var i = 12; i < 15; i++)
Assert.AreEqual(0, padded[0, i].GetInt32());
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
for (var i = 0; i < 15; i++)
Assert.AreNotEqual(0, padded[1, i].GetInt32());
}

[TestMethod]
public void PadSequencesPostPaddingTrunc_Larger()
{
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
tokenizer.fit_on_texts(texts);

var sequences = tokenizer.texts_to_sequences(texts);
var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post");

Assert.AreEqual(4, padded.shape[0]);
Assert.AreEqual(45, padded.shape[1]);

Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
for (var i = 32; i < 45; i++)
Assert.AreEqual(0, padded[0, i].GetInt32());
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
}

[TestMethod]
public void TextToMatrixBinary()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

Assert.AreEqual(27, tokenizer.word_index.Count);

var matrix = tokenizer.texts_to_matrix(texts);

Assert.AreEqual(texts.Length, matrix.shape[0]);

CompareLists(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
}

[TestMethod]
public void TextToMatrixCount()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

Assert.AreEqual(27, tokenizer.word_index.Count);

var matrix = tokenizer.texts_to_matrix(texts, mode:"count");

Assert.AreEqual(texts.Length, matrix.shape[0]);

CompareLists(new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
}

[TestMethod]
public void TextToMatrixFrequency()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

Assert.AreEqual(27, tokenizer.word_index.Count);

var matrix = tokenizer.texts_to_matrix(texts, mode: "freq");

Assert.AreEqual(texts.Length, matrix.shape[0]);

double t12 = 2.0 / 12.0;
double o12 = 1.0 / 12.0;
double t22 = 2.0 / 22.0;
double o22 = 1.0 / 22.0;

CompareLists(new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
CompareLists(new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray<double>());
}

[TestMethod]
public void TextToMatrixTfIdf()
{
var tokenizer = keras.preprocessing.text.Tokenizer();
tokenizer.fit_on_texts(texts);

Assert.AreEqual(27, tokenizer.word_index.Count);

var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");

Assert.AreEqual(texts.Length, matrix.shape[0]);

double t1 = 1.1736001944781467;
double t2 = 0.69314718055994529;
double t3 = 1.860112299086919;
double t4 = 1.0986122886681098;
double t5 = 0.69314718055994529;

CompareLists(new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
CompareLists(new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>());
}

private void CompareLists<T>(IList<T> expected, IList<T> actual)
{
Assert.AreEqual(expected.Count, actual.Count);
for (var i = 0; i < expected.Count; i++)
{
Assert.AreEqual(expected[i], actual[i]);
}
}

}
}
