Merge from master repotags/v0.40-tf2.4-tstring
@@ -10,6 +10,10 @@ namespace Tensorflow.Keras | |||
public Sequence sequence => new Sequence(); | |||
public DatasetUtils dataset_utils => new DatasetUtils(); | |||
public TextApi text => _text; | |||
private static TextApi _text = new TextApi(); | |||
public TextVectorization TextVectorization(Func<Tensor, Tensor> standardize = null, | |||
string split = "whitespace", | |||
int max_tokens = -1, | |||
@@ -0,0 +1,444 @@ | |||
using NumSharp; | |||
using Serilog.Debugging; | |||
using System; | |||
using System.Collections.Generic; | |||
using System.Collections.Specialized; | |||
using System.Data.SqlTypes; | |||
using System.Linq; | |||
using System.Net.Sockets; | |||
using System.Text; | |||
namespace Tensorflow.Keras.Text | |||
{ | |||
/// <summary> | |||
/// Text tokenization API. | |||
/// This class allows to vectorize a text corpus, by turning each text into either a sequence of integers | |||
/// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for | |||
/// each token could be binary, based on word count, based on tf-idf... | |||
/// </summary> | |||
/// <remarks> | |||
/// This code is a fairly straight port of the Python code for Keras text preprocessing found at: | |||
/// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py | |||
/// </remarks> | |||
public class Tokenizer | |||
{ | |||
private readonly int num_words; | |||
private readonly string filters; | |||
private readonly bool lower; | |||
private readonly char split; | |||
private readonly bool char_level; | |||
private readonly string oov_token; | |||
private readonly Func<string, IEnumerable<string>> analyzer; | |||
private int document_count = 0; | |||
private Dictionary<string, int> word_docs = new Dictionary<string, int>(); | |||
private Dictionary<string, int> word_counts = new Dictionary<string, int>(); | |||
public Dictionary<string, int> word_index = null; | |||
public Dictionary<int, string> index_word = null; | |||
private Dictionary<int, int> index_docs = null; | |||
public Tokenizer( | |||
int num_words = -1, | |||
string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n", | |||
bool lower = true, | |||
char split = ' ', | |||
bool char_level = false, | |||
string oov_token = null, | |||
Func<string, IEnumerable<string>> analyzer = null) | |||
{ | |||
this.num_words = num_words; | |||
this.filters = filters; | |||
this.lower = lower; | |||
this.split = split; | |||
this.char_level = char_level; | |||
this.oov_token = oov_token; | |||
this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split); | |||
} | |||
/// <summary> | |||
/// Updates internal vocabulary based on a list of texts. | |||
/// </summary> | |||
/// <param name="texts">A list of strings, each containing one or more tokens.</param> | |||
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks> | |||
public void fit_on_texts(IEnumerable<string> texts) | |||
{ | |||
foreach (var text in texts) | |||
{ | |||
IEnumerable<string> seq = null; | |||
document_count += 1; | |||
if (char_level) | |||
{ | |||
throw new NotImplementedException("char_level == true"); | |||
} | |||
else | |||
{ | |||
seq = analyzer(lower ? text.ToLower() : text); | |||
} | |||
foreach (var w in seq) | |||
{ | |||
var count = 0; | |||
word_counts.TryGetValue(w, out count); | |||
word_counts[w] = count + 1; | |||
} | |||
foreach (var w in new HashSet<string>(seq)) | |||
{ | |||
var count = 0; | |||
word_docs.TryGetValue(w, out count); | |||
word_docs[w] = count + 1; | |||
} | |||
} | |||
var wcounts = word_counts.AsEnumerable().ToList(); | |||
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order. | |||
var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token }; | |||
sorted_voc.AddRange(word_counts.Select(kv => kv.Key)); | |||
if (num_words > 0 - 1) | |||
{ | |||
sorted_voc = sorted_voc.Take<string>((oov_token == null) ? num_words : num_words + 1).ToList(); | |||
} | |||
word_index = new Dictionary<string, int>(sorted_voc.Count); | |||
index_word = new Dictionary<int, string>(sorted_voc.Count); | |||
index_docs = new Dictionary<int, int>(word_docs.Count); | |||
for (int i = 0; i < sorted_voc.Count; i++) | |||
{ | |||
word_index.Add(sorted_voc[i], i + 1); | |||
index_word.Add(i + 1, sorted_voc[i]); | |||
} | |||
foreach (var kv in word_docs) | |||
{ | |||
var idx = -1; | |||
if (word_index.TryGetValue(kv.Key, out idx)) | |||
{ | |||
index_docs.Add(idx, kv.Value); | |||
} | |||
} | |||
} | |||
/// <summary> | |||
/// Updates internal vocabulary based on a list of texts. | |||
/// </summary> | |||
/// <param name="texts">A list of list of strings, each containing one token.</param> | |||
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks> | |||
public void fit_on_texts(IEnumerable<IEnumerable<string>> texts) | |||
{ | |||
foreach (var seq in texts) | |||
{ | |||
foreach (var w in seq.Select(s => lower ? s.ToLower() : s)) | |||
{ | |||
var count = 0; | |||
word_counts.TryGetValue(w, out count); | |||
word_counts[w] = count + 1; | |||
} | |||
foreach (var w in new HashSet<string>(word_counts.Keys)) | |||
{ | |||
var count = 0; | |||
word_docs.TryGetValue(w, out count); | |||
word_docs[w] = count + 1; | |||
} | |||
} | |||
var wcounts = word_counts.AsEnumerable().ToList(); | |||
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); | |||
var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token }; | |||
sorted_voc.AddRange(word_counts.Select(kv => kv.Key)); | |||
if (num_words > 0 - 1) | |||
{ | |||
sorted_voc = sorted_voc.Take<string>((oov_token == null) ? num_words : num_words + 1).ToList(); | |||
} | |||
word_index = new Dictionary<string, int>(sorted_voc.Count); | |||
index_word = new Dictionary<int, string>(sorted_voc.Count); | |||
index_docs = new Dictionary<int, int>(word_docs.Count); | |||
for (int i = 0; i < sorted_voc.Count; i++) | |||
{ | |||
word_index.Add(sorted_voc[i], i + 1); | |||
index_word.Add(i + 1, sorted_voc[i]); | |||
} | |||
foreach (var kv in word_docs) | |||
{ | |||
var idx = -1; | |||
if (word_index.TryGetValue(kv.Key, out idx)) | |||
{ | |||
index_docs.Add(idx, kv.Value); | |||
} | |||
} | |||
} | |||
/// <summary> | |||
/// Updates internal vocabulary based on a list of sequences. | |||
/// </summary> | |||
/// <param name="sequences"></param> | |||
/// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks> | |||
public void fit_on_sequences(IEnumerable<int[]> sequences) | |||
{ | |||
throw new NotImplementedException("fit_on_sequences"); | |||
} | |||
/// <summary> | |||
/// Transforms each string in texts to a sequence of integers. | |||
/// </summary> | |||
/// <param name="texts"></param> | |||
/// <returns></returns> | |||
/// <remarks>Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account.</remarks> | |||
public IList<int[]> texts_to_sequences(IEnumerable<string> texts) | |||
{ | |||
return texts_to_sequences_generator(texts).ToArray(); | |||
} | |||
/// <summary> | |||
/// Transforms each token in texts to a sequence of integers. | |||
/// </summary> | |||
/// <param name="texts"></param> | |||
/// <returns></returns> | |||
/// <remarks>Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account.</remarks> | |||
public IList<int[]> texts_to_sequences(IEnumerable<IEnumerable<string>> texts) | |||
{ | |||
return texts_to_sequences_generator(texts).ToArray(); | |||
} | |||
public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts) | |||
{ | |||
int oov_index = -1; | |||
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index); | |||
return texts.Select(text => | |||
{ | |||
IEnumerable<string> seq = null; | |||
if (char_level) | |||
{ | |||
throw new NotImplementedException("char_level == true"); | |||
} | |||
else | |||
{ | |||
seq = analyzer(lower ? text.ToLower() : text); | |||
} | |||
return ConvertToSequence(oov_index, seq).ToArray(); | |||
}); | |||
} | |||
public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IEnumerable<string>> texts) | |||
{ | |||
int oov_index = -1; | |||
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index); | |||
return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray()); | |||
} | |||
private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq) | |||
{ | |||
var vect = new List<int>(); | |||
foreach (var w in seq.Select(s => lower ? s.ToLower() : s)) | |||
{ | |||
var i = -1; | |||
if (word_index.TryGetValue(w, out i)) | |||
{ | |||
if (num_words != -1 && i >= num_words) | |||
{ | |||
if (oov_index != -1) | |||
{ | |||
vect.Add(oov_index); | |||
} | |||
} | |||
else | |||
{ | |||
vect.Add(i); | |||
} | |||
} | |||
else if (oov_index != -1) | |||
{ | |||
vect.Add(oov_index); | |||
} | |||
} | |||
return vect; | |||
} | |||
/// <summary> | |||
/// Transforms each sequence into a list of text. | |||
/// </summary> | |||
/// <param name="sequences"></param> | |||
/// <returns>A list of texts(strings)</returns> | |||
/// <remarks>Only top num_words-1 most frequent words will be taken into account.Only words known by the tokenizer will be taken into account.</remarks> | |||
public IList<string> sequences_to_texts(IEnumerable<int[]> sequences) | |||
{ | |||
return sequences_to_texts_generator(sequences).ToArray(); | |||
} | |||
public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences) | |||
{ | |||
int oov_index = -1; | |||
var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index); | |||
return sequences.Select(seq => | |||
{ | |||
var bldr = new StringBuilder(); | |||
for (var i = 0; i < seq.Count; i++) | |||
{ | |||
if (i > 0) bldr.Append(' '); | |||
string word = null; | |||
if (index_word.TryGetValue(seq[i], out word)) | |||
{ | |||
if (num_words != -1 && i >= num_words) | |||
{ | |||
if (oov_index != -1) | |||
{ | |||
bldr.Append(oov_token); | |||
} | |||
} | |||
else | |||
{ | |||
bldr.Append(word); | |||
} | |||
} | |||
else if (oov_index != -1) | |||
{ | |||
bldr.Append(oov_token); | |||
} | |||
} | |||
return bldr.ToString(); | |||
}); | |||
} | |||
/// <summary> | |||
/// Convert a list of texts to a Numpy matrix. | |||
/// </summary> | |||
/// <param name="texts">A sequence of strings containing one or more tokens.</param> | |||
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param> | |||
/// <returns></returns> | |||
public NDArray texts_to_matrix(IEnumerable<string> texts, string mode = "binary") | |||
{ | |||
return sequences_to_matrix(texts_to_sequences(texts), mode); | |||
} | |||
/// <summary> | |||
/// Convert a list of texts to a Numpy matrix. | |||
/// </summary> | |||
/// <param name="texts">A sequence of lists of strings, each containing one token.</param> | |||
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param> | |||
/// <returns></returns> | |||
public NDArray texts_to_matrix(IEnumerable<IList<string>> texts, string mode = "binary") | |||
{ | |||
return sequences_to_matrix(texts_to_sequences(texts), mode); | |||
} | |||
/// <summary> | |||
/// Converts a list of sequences into a Numpy matrix. | |||
/// </summary> | |||
/// <param name="sequences">A sequence of lists of integers, encoding tokens.</param> | |||
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param> | |||
/// <returns></returns> | |||
public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences, string mode = "binary") | |||
{ | |||
if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}"); | |||
var word_count = 0; | |||
if (num_words == -1) | |||
{ | |||
if (word_index != null) | |||
{ | |||
word_count = word_index.Count + 1; | |||
} | |||
else | |||
{ | |||
throw new InvalidOperationException("Specifya dimension ('num_words' arugment), or fit on some text data first."); | |||
} | |||
} | |||
else | |||
{ | |||
word_count = num_words; | |||
} | |||
if (mode == "tfidf" && this.document_count == 0) | |||
{ | |||
throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode."); | |||
} | |||
var x = np.zeros(sequences.Count(), word_count); | |||
for (int i = 0; i < sequences.Count(); i++) | |||
{ | |||
var seq = sequences.ElementAt(i); | |||
if (seq == null || seq.Count == 0) | |||
continue; | |||
var counts = new Dictionary<int, int>(); | |||
var seq_length = seq.Count; | |||
foreach (var j in seq) | |||
{ | |||
if (j >= word_count) | |||
continue; | |||
var count = 0; | |||
counts.TryGetValue(j, out count); | |||
counts[j] = count + 1; | |||
} | |||
if (mode == "count") | |||
{ | |||
foreach (var kv in counts) | |||
{ | |||
var j = kv.Key; | |||
var c = kv.Value; | |||
x[i, j] = c; | |||
} | |||
} | |||
else if (mode == "freq") | |||
{ | |||
foreach (var kv in counts) | |||
{ | |||
var j = kv.Key; | |||
var c = kv.Value; | |||
x[i, j] = ((double)c) / seq_length; | |||
} | |||
} | |||
else if (mode == "binary") | |||
{ | |||
foreach (var kv in counts) | |||
{ | |||
var j = kv.Key; | |||
var c = kv.Value; | |||
x[i, j] = 1; | |||
} | |||
} | |||
else if (mode == "tfidf") | |||
{ | |||
foreach (var kv in counts) | |||
{ | |||
var j = kv.Key; | |||
var c = kv.Value; | |||
var id = 0; | |||
var _ = index_docs.TryGetValue(j, out id); | |||
var tf = 1 + np.log(c); | |||
var idf = np.log(1 + document_count / (1 + id)); | |||
x[i, j] = tf * idf; | |||
} | |||
} | |||
} | |||
return x; | |||
} | |||
private string[] modes = new string[] { "binary", "count", "tfidf", "freq" }; | |||
} | |||
} |
@@ -15,7 +15,9 @@ | |||
******************************************************************************/ | |||
using NumSharp; | |||
using NumSharp.Utilities; | |||
using System; | |||
using System.Collections.Generic; | |||
using System.Linq; | |||
namespace Tensorflow.Keras | |||
@@ -34,14 +36,18 @@ namespace Tensorflow.Keras | |||
/// <param name="truncating">String, 'pre' or 'post'</param> | |||
/// <param name="value">Float or String, padding value.</param> | |||
/// <returns></returns> | |||
public NDArray pad_sequences(NDArray sequences, | |||
public NDArray pad_sequences(IEnumerable<int[]> sequences, | |||
int? maxlen = null, | |||
string dtype = "int32", | |||
string padding = "pre", | |||
string truncating = "pre", | |||
object value = null) | |||
{ | |||
int[] length = new int[sequences.size]; | |||
if (value != null) throw new NotImplementedException("padding with a specific value."); | |||
if (padding != "pre" && padding != "post") throw new InvalidArgumentError("padding must be 'pre' or 'post'."); | |||
if (truncating != "pre" && truncating != "post") throw new InvalidArgumentError("truncating must be 'pre' or 'post'."); | |||
var length = sequences.Select(s => s.Length); | |||
if (maxlen == null) | |||
maxlen = length.Max(); | |||
@@ -49,19 +55,26 @@ namespace Tensorflow.Keras | |||
if (value == null) | |||
value = 0f; | |||
var nd = new NDArray(np.int32, new Shape(sequences.size, maxlen.Value)); | |||
#pragma warning disable CS0162 // Unreachable code detected | |||
var type = getNPType(dtype); | |||
var nd = new NDArray(type, new Shape(length.Count(), maxlen.Value), true); | |||
for (int i = 0; i < nd.shape[0]; i++) | |||
#pragma warning restore CS0162 // Unreachable code detected | |||
{ | |||
switch (sequences[i]) | |||
var s = sequences.ElementAt(i); | |||
if (s.Length > maxlen.Value) | |||
{ | |||
default: | |||
throw new NotImplementedException("pad_sequences"); | |||
s = (truncating == "pre") ? s.Slice(s.Length - maxlen.Value, s.Length) : s.Slice(0, maxlen.Value); | |||
} | |||
var sliceString = (padding == "pre") ? $"{i},{maxlen - s.Length}:" : $"{i},:{s.Length}"; | |||
nd[sliceString] = np.array(s); | |||
} | |||
return nd; | |||
} | |||
private Type getNPType(string typeName) | |||
{ | |||
return System.Type.GetType("NumSharp.np,NumSharp").GetField(typeName).GetValue(null) as Type; | |||
} | |||
} | |||
} |
@@ -0,0 +1,35 @@ | |||
using System; | |||
using System.Collections.Generic; | |||
using System.Linq; | |||
using System.Text; | |||
using Tensorflow.Keras.Text; | |||
namespace Tensorflow.Keras | |||
{ | |||
public class TextApi | |||
{ | |||
public Tensorflow.Keras.Text.Tokenizer Tokenizer( | |||
int num_words = -1, | |||
string filters = DefaultFilter, | |||
bool lower = true, | |||
char split = ' ', | |||
bool char_level = false, | |||
string oov_token = null, | |||
Func<string, IEnumerable<string>> analyzer = null) | |||
{ | |||
return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer); | |||
} | |||
public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ') | |||
{ | |||
if (lower) | |||
{ | |||
text = text.ToLower(); | |||
} | |||
var newText = new String(text.Where(c => !filters.Contains(c)).ToArray()); | |||
return newText.Split(split); | |||
} | |||
private const string DefaultFilter = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n"; | |||
} | |||
} |
@@ -0,0 +1,413 @@ | |||
using Microsoft.VisualStudio.TestTools.UnitTesting; | |||
using System; | |||
using System.Linq; | |||
using System.Collections.Generic; | |||
using System.Text; | |||
using NumSharp; | |||
using static Tensorflow.KerasApi; | |||
using Tensorflow; | |||
using Tensorflow.Keras.Datasets; | |||
using Microsoft.Extensions.DependencyInjection; | |||
namespace TensorFlowNET.Keras.UnitTest | |||
{ | |||
[TestClass] | |||
public class PreprocessingTests : EagerModeTestBase | |||
{ | |||
private readonly string[] texts = new string[] { | |||
"It was the best of times, it was the worst of times.", | |||
"Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.", | |||
"It was the best of times, it was the worst of times.", | |||
"Mr and Mrs Dursley of number four, Privet Drive.", | |||
}; | |||
private readonly string[][] tokenized_texts = new string[][] { | |||
new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"}, | |||
new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"}, | |||
new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"}, | |||
new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"}, | |||
}; | |||
private readonly string[] processed_texts = new string[] { | |||
"it was the best of times it was the worst of times", | |||
"mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much", | |||
"it was the best of times it was the worst of times", | |||
"mr and mrs dursley of number four privet drive", | |||
}; | |||
private const string OOV = "<OOV>"; | |||
[TestMethod] | |||
public void TokenizeWithNoOOV() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
Assert.AreEqual(27, tokenizer.word_index.Count); | |||
Assert.AreEqual(7, tokenizer.word_index["worst"]); | |||
Assert.AreEqual(12, tokenizer.word_index["number"]); | |||
Assert.AreEqual(16, tokenizer.word_index["were"]); | |||
} | |||
[TestMethod] | |||
public void TokenizeWithNoOOV_Tkn() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
// Use the list version, where the tokenization has already been done. | |||
tokenizer.fit_on_texts(tokenized_texts); | |||
Assert.AreEqual(27, tokenizer.word_index.Count); | |||
Assert.AreEqual(7, tokenizer.word_index["worst"]); | |||
Assert.AreEqual(12, tokenizer.word_index["number"]); | |||
Assert.AreEqual(16, tokenizer.word_index["were"]); | |||
} | |||
[TestMethod] | |||
public void TokenizeWithOOV() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
Assert.AreEqual(28, tokenizer.word_index.Count); | |||
Assert.AreEqual(1, tokenizer.word_index[OOV]); | |||
Assert.AreEqual(8, tokenizer.word_index["worst"]); | |||
Assert.AreEqual(13, tokenizer.word_index["number"]); | |||
Assert.AreEqual(17, tokenizer.word_index["were"]); | |||
} | |||
[TestMethod] | |||
public void TokenizeWithOOV_Tkn() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
// Use the list version, where the tokenization has already been done. | |||
tokenizer.fit_on_texts(tokenized_texts); | |||
Assert.AreEqual(28, tokenizer.word_index.Count); | |||
Assert.AreEqual(1, tokenizer.word_index[OOV]); | |||
Assert.AreEqual(8, tokenizer.word_index["worst"]); | |||
Assert.AreEqual(13, tokenizer.word_index["number"]); | |||
Assert.AreEqual(17, tokenizer.word_index["were"]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequences() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); | |||
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequences_Tkn() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
// Use the list version, where the tokenization has already been done. | |||
tokenizer.fit_on_texts(tokenized_texts); | |||
var sequences = tokenizer.texts_to_sequences(tokenized_texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); | |||
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequencesAndBack() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
var processed = tokenizer.sequences_to_texts(sequences); | |||
Assert.AreEqual(4, processed.Count); | |||
for (var i = 0; i < processed.Count; i++) | |||
Assert.AreEqual(processed_texts[i], processed[i]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequencesAndBack_Tkn1() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
// Use the list version, where the tokenization has already been done. | |||
tokenizer.fit_on_texts(tokenized_texts); | |||
// Use the list version, where the tokenization has already been done. | |||
var sequences = tokenizer.texts_to_sequences(tokenized_texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
var processed = tokenizer.sequences_to_texts(sequences); | |||
Assert.AreEqual(4, processed.Count); | |||
for (var i = 0; i < processed.Count; i++) | |||
Assert.AreEqual(processed_texts[i], processed[i]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequencesAndBack_Tkn2() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
// Use the list version, where the tokenization has already been done. | |||
tokenizer.fit_on_texts(tokenized_texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
var processed = tokenizer.sequences_to_texts(sequences); | |||
Assert.AreEqual(4, processed.Count); | |||
for (var i = 0; i < processed.Count; i++) | |||
Assert.AreEqual(processed_texts[i], processed[i]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequencesAndBack_Tkn3() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
// Use the list version, where the tokenization has already been done. | |||
var sequences = tokenizer.texts_to_sequences(tokenized_texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
var processed = tokenizer.sequences_to_texts(sequences); | |||
Assert.AreEqual(4, processed.Count); | |||
for (var i = 0; i < processed.Count; i++) | |||
Assert.AreEqual(processed_texts[i], processed[i]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequencesWithOOV() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); | |||
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); | |||
for (var i = 0; i < sequences.Count; i++) | |||
for (var j = 0; j < sequences[i].Length; j++) | |||
Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]); | |||
} | |||
[TestMethod] | |||
public void TokenizeTextsToSequencesWithOOVPresent() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words:20); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
Assert.AreEqual(4, sequences.Count); | |||
Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]); | |||
Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]); | |||
var oov_count = 0; | |||
for (var i = 0; i < sequences.Count; i++) | |||
for (var j = 0; j < sequences[i].Length; j++) | |||
if (tokenizer.word_index[OOV] == sequences[i][j]) | |||
oov_count += 1; | |||
Assert.AreEqual(9, oov_count); | |||
} | |||
[TestMethod] | |||
public void PadSequencesWithDefaults() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
var padded = keras.preprocessing.sequence.pad_sequences(sequences); | |||
Assert.AreEqual(4, padded.shape[0]); | |||
Assert.AreEqual(22, padded.shape[1]); | |||
Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32()); | |||
for (var i = 0; i < 8; i++) | |||
Assert.AreEqual(0, padded[0, i].GetInt32()); | |||
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32()); | |||
for (var i = 0; i < 20; i++) | |||
Assert.AreNotEqual(0, padded[1, i].GetInt32()); | |||
} | |||
[TestMethod] | |||
public void PadSequencesPrePaddingTrunc() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
var padded = keras.preprocessing.sequence.pad_sequences(sequences,maxlen:15); | |||
Assert.AreEqual(4, padded.shape[0]); | |||
Assert.AreEqual(15, padded.shape[1]); | |||
Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32()); | |||
for (var i = 0; i < 3; i++) | |||
Assert.AreEqual(0, padded[0, i].GetInt32()); | |||
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 3].GetInt32()); | |||
for (var i = 0; i < 15; i++) | |||
Assert.AreNotEqual(0, padded[1, i].GetInt32()); | |||
} | |||
[TestMethod] | |||
public void PadSequencesPrePaddingTrunc_Larger() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45); | |||
Assert.AreEqual(4, padded.shape[0]); | |||
Assert.AreEqual(45, padded.shape[1]); | |||
Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 42].GetInt32()); | |||
for (var i = 0; i < 33; i++) | |||
Assert.AreEqual(0, padded[0, i].GetInt32()); | |||
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 33].GetInt32()); | |||
} | |||
[TestMethod] | |||
public void PadSequencesPostPaddingTrunc() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post"); | |||
Assert.AreEqual(4, padded.shape[0]); | |||
Assert.AreEqual(15, padded.shape[1]); | |||
Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32()); | |||
for (var i = 12; i < 15; i++) | |||
Assert.AreEqual(0, padded[0, i].GetInt32()); | |||
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32()); | |||
for (var i = 0; i < 15; i++) | |||
Assert.AreNotEqual(0, padded[1, i].GetInt32()); | |||
} | |||
[TestMethod] | |||
public void PadSequencesPostPaddingTrunc_Larger() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV); | |||
tokenizer.fit_on_texts(texts); | |||
var sequences = tokenizer.texts_to_sequences(texts); | |||
var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post"); | |||
Assert.AreEqual(4, padded.shape[0]); | |||
Assert.AreEqual(45, padded.shape[1]); | |||
Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32()); | |||
for (var i = 32; i < 45; i++) | |||
Assert.AreEqual(0, padded[0, i].GetInt32()); | |||
Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32()); | |||
} | |||
[TestMethod] | |||
public void TextToMatrixBinary() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
Assert.AreEqual(27, tokenizer.word_index.Count); | |||
var matrix = tokenizer.texts_to_matrix(texts); | |||
Assert.AreEqual(texts.Length, matrix.shape[0]); | |||
CompareLists(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()); | |||
CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>()); | |||
} | |||
[TestMethod] | |||
public void TextToMatrixCount() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
Assert.AreEqual(27, tokenizer.word_index.Count); | |||
var matrix = tokenizer.texts_to_matrix(texts, mode:"count"); | |||
Assert.AreEqual(texts.Length, matrix.shape[0]); | |||
CompareLists(new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()); | |||
CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>()); | |||
} | |||
[TestMethod] | |||
public void TextToMatrixFrequency() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
Assert.AreEqual(27, tokenizer.word_index.Count); | |||
var matrix = tokenizer.texts_to_matrix(texts, mode: "freq"); | |||
Assert.AreEqual(texts.Length, matrix.shape[0]); | |||
double t12 = 2.0 / 12.0; | |||
double o12 = 1.0 / 12.0; | |||
double t22 = 2.0 / 22.0; | |||
double o22 = 1.0 / 22.0; | |||
CompareLists(new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()); | |||
CompareLists(new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray<double>()); | |||
} | |||
[TestMethod] | |||
public void TextToMatrixTDIDF() | |||
{ | |||
var tokenizer = keras.preprocessing.text.Tokenizer(); | |||
tokenizer.fit_on_texts(texts); | |||
Assert.AreEqual(27, tokenizer.word_index.Count); | |||
var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf"); | |||
Assert.AreEqual(texts.Length, matrix.shape[0]); | |||
double t1 = 1.1736001944781467; | |||
double t2 = 0.69314718055994529; | |||
double t3 = 1.860112299086919; | |||
double t4 = 1.0986122886681098; | |||
double t5 = 0.69314718055994529; | |||
CompareLists(new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>()); | |||
CompareLists(new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>()); | |||
} | |||
private void CompareLists<T>(IList<T> expected, IList<T> actual) | |||
{ | |||
Assert.AreEqual(expected.Count, actual.Count); | |||
for (var i = 0; i < expected.Count; i++) | |||
{ | |||
Assert.AreEqual(expected[i], actual[i]); | |||
} | |||
} | |||
} | |||
} |