@@ -10,6 +10,10 @@ namespace Tensorflow.Keras
        public Sequence sequence => new Sequence();
        public DatasetUtils dataset_utils => new DatasetUtils();

        public TextApi text => _text;

        private static TextApi _text = new TextApi();

        public TextVectorization TextVectorization(Func<Tensor, Tensor> standardize = null,
            string split = "whitespace",
            int max_tokens = -1,
@@ -0,0 +1,444 @@
using NumSharp;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Tensorflow.Keras.Text
{
    /// <summary>
    /// Text tokenization API.
    /// This class allows you to vectorize a text corpus, by turning each text into either a sequence of integers
    /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
    /// each token could be binary, based on word count, based on tf-idf...
    /// </summary>
    /// <remarks>
    /// This code is a fairly straight port of the Python code for Keras text preprocessing found at:
    /// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
    /// </remarks>
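    /// <example>
    /// A minimal usage sketch (not part of the original source):
    /// <code>
    /// var tokenizer = new Tokenizer(oov_token: "&lt;OOV&gt;");
    /// tokenizer.fit_on_texts(new[] { "hello world", "goodbye world" });
    /// var seqs = tokenizer.texts_to_sequences(new[] { "hello again" });
    /// // "hello" maps to its fitted index; "again" was never seen and falls back to the OOV index.
    /// </code>
    /// </example>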
    public class Tokenizer
    {
        private readonly int num_words;
        private readonly string filters;
        private readonly bool lower;
        private readonly char split;
        private readonly bool char_level;
        private readonly string oov_token;
        private readonly Func<string, IEnumerable<string>> analyzer;

        private int document_count = 0;

        private Dictionary<string, int> word_docs = new Dictionary<string, int>();
        private Dictionary<string, int> word_counts = new Dictionary<string, int>();

        public Dictionary<string, int> word_index = null;
        public Dictionary<int, string> index_word = null;

        private Dictionary<int, int> index_docs = null;
        public Tokenizer(
            int num_words = -1,
            string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
            bool lower = true,
            char split = ' ',
            bool char_level = false,
            string oov_token = null,
            Func<string, IEnumerable<string>> analyzer = null)
        {
            this.num_words = num_words;
            this.filters = filters;
            this.lower = lower;
            this.split = split;
            this.char_level = char_level;
            this.oov_token = oov_token;
            this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split);
        }
        /// <summary>
        /// Updates internal vocabulary based on a list of texts.
        /// </summary>
        /// <param name="texts">A list of strings, each containing one or more tokens.</param>
        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
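        /// <example>
        /// A minimal sketch of the resulting vocabulary (not part of the original source):
        /// <code>
        /// var tk = new Tokenizer();
        /// tk.fit_on_texts(new[] { "the cat sat", "the dog sat" });
        /// // tk.word_index: { "the": 1, "sat": 2, "cat": 3, "dog": 4 } -- most frequent words get the lowest indices.
        /// </code>
        /// </example>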
        public void fit_on_texts(IEnumerable<string> texts)
        {
            foreach (var text in texts)
            {
                IEnumerable<string> seq = null;

                document_count += 1;
                if (char_level)
                {
                    throw new NotImplementedException("char_level == true");
                }
                else
                {
                    seq = analyzer(lower ? text.ToLower() : text);
                }

                foreach (var w in seq)
                {
                    var count = 0;
                    word_counts.TryGetValue(w, out count);
                    word_counts[w] = count + 1;
                }

                foreach (var w in new HashSet<string>(seq))
                {
                    var count = 0;
                    word_docs.TryGetValue(w, out count);
                    word_docs[w] = count + 1;
                }
            }
            // Stable sort by descending count; ties keep their insertion order (matches Python's sorted()).
            var wcounts = word_counts.OrderByDescending(kv => kv.Value).ToList();

            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
            sorted_voc.AddRange(wcounts.Select(kv => kv.Key));

            if (num_words > -1)
            {
                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
            }
            word_index = new Dictionary<string, int>(sorted_voc.Count);
            index_word = new Dictionary<int, string>(sorted_voc.Count);
            index_docs = new Dictionary<int, int>(word_docs.Count);

            for (int i = 0; i < sorted_voc.Count; i++)
            {
                word_index.Add(sorted_voc[i], i + 1);
                index_word.Add(i + 1, sorted_voc[i]);
            }

            foreach (var kv in word_docs)
            {
                var idx = -1;
                if (word_index.TryGetValue(kv.Key, out idx))
                {
                    index_docs.Add(idx, kv.Value);
                }
            }
        }
        /// <summary>
        /// Updates internal vocabulary based on a list of texts.
        /// </summary>
        /// <param name="texts">A list of lists of strings, each inner string containing one token.</param>
        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
        public void fit_on_texts(IEnumerable<IEnumerable<string>> texts)
        {
            foreach (var seq in texts)
            {
                document_count += 1;
                var tokens = seq.Select(s => lower ? s.ToLower() : s).ToList();

                foreach (var w in tokens)
                {
                    var count = 0;
                    word_counts.TryGetValue(w, out count);
                    word_counts[w] = count + 1;
                }

                // For document frequencies, each distinct token of this sequence counts once.
                foreach (var w in new HashSet<string>(tokens))
                {
                    var count = 0;
                    word_docs.TryGetValue(w, out count);
                    word_docs[w] = count + 1;
                }
            }
            // Stable sort by descending count, as in the string overload above.
            var wcounts = word_counts.OrderByDescending(kv => kv.Value).ToList();

            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
            sorted_voc.AddRange(wcounts.Select(kv => kv.Key));

            if (num_words > -1)
            {
                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
            }
            word_index = new Dictionary<string, int>(sorted_voc.Count);
            index_word = new Dictionary<int, string>(sorted_voc.Count);
            index_docs = new Dictionary<int, int>(word_docs.Count);

            for (int i = 0; i < sorted_voc.Count; i++)
            {
                word_index.Add(sorted_voc[i], i + 1);
                index_word.Add(i + 1, sorted_voc[i]);
            }

            foreach (var kv in word_docs)
            {
                var idx = -1;
                if (word_index.TryGetValue(kv.Key, out idx))
                {
                    index_docs.Add(idx, kv.Value);
                }
            }
        }
        /// <summary>
        /// Updates internal vocabulary based on a list of sequences.
        /// </summary>
        /// <param name="sequences"></param>
        /// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
        public void fit_on_sequences(IEnumerable<int[]> sequences)
        {
            throw new NotImplementedException("fit_on_sequences");
        }

        /// <summary>
        /// Transforms each string in texts to a sequence of integers.
        /// </summary>
        /// <param name="texts"></param>
        /// <returns></returns>
        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
        public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
        {
            return texts_to_sequences_generator(texts).ToArray();
        }

        /// <summary>
        /// Transforms each sequence of tokens in texts to a sequence of integers.
        /// </summary>
        /// <param name="texts"></param>
        /// <returns></returns>
        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
        public IList<int[]> texts_to_sequences(IEnumerable<IEnumerable<string>> texts)
        {
            return texts_to_sequences_generator(texts).ToArray();
        }
        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
        {
            int oov_index = -1;
            if (oov_token != null)
                word_index.TryGetValue(oov_token, out oov_index);

            return texts.Select(text =>
            {
                IEnumerable<string> seq = null;

                if (char_level)
                {
                    throw new NotImplementedException("char_level == true");
                }
                else
                {
                    seq = analyzer(lower ? text.ToLower() : text);
                }
                return ConvertToSequence(oov_index, seq).ToArray();
            });
        }

        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IEnumerable<string>> texts)
        {
            int oov_index = -1;
            if (oov_token != null)
                word_index.TryGetValue(oov_token, out oov_index);

            return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
        }
        private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
        {
            var vect = new List<int>();
            foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
            {
                var i = -1;
                if (word_index.TryGetValue(w, out i))
                {
                    if (num_words != -1 && i >= num_words)
                    {
                        if (oov_index != -1)
                        {
                            vect.Add(oov_index);
                        }
                    }
                    else
                    {
                        vect.Add(i);
                    }
                }
                else if (oov_index != -1)
                {
                    vect.Add(oov_index);
                }
            }

            return vect;
        }
        /// <summary>
        /// Transforms each sequence into a text (string).
        /// </summary>
        /// <param name="sequences"></param>
        /// <returns>A list of texts (strings).</returns>
        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
        public IList<string> sequences_to_texts(IEnumerable<int[]> sequences)
        {
            return sequences_to_texts_generator(sequences).ToArray();
        }

        public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences)
        {
            int oov_index = -1;
            if (oov_token != null)
                word_index.TryGetValue(oov_token, out oov_index);

            return sequences.Select(seq =>
            {
                var bldr = new StringBuilder();
                for (var i = 0; i < seq.Count; i++)
                {
                    if (i > 0) bldr.Append(' ');

                    string word = null;
                    if (index_word.TryGetValue(seq[i], out word))
                    {
                        // Compare the token index (seq[i]), not the loop position, against num_words.
                        if (num_words != -1 && seq[i] >= num_words)
                        {
                            if (oov_index != -1)
                            {
                                bldr.Append(oov_token);
                            }
                        }
                        else
                        {
                            bldr.Append(word);
                        }
                    }
                    else if (oov_index != -1)
                    {
                        bldr.Append(oov_token);
                    }
                }
                return bldr.ToString();
            });
        }
        /// <summary>
        /// Convert a list of texts to a Numpy matrix.
        /// </summary>
        /// <param name="texts">A sequence of strings containing one or more tokens.</param>
        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
        /// <returns></returns>
        public NDArray texts_to_matrix(IEnumerable<string> texts, string mode = "binary")
        {
            return sequences_to_matrix(texts_to_sequences(texts), mode);
        }

        /// <summary>
        /// Convert a list of texts to a Numpy matrix.
        /// </summary>
        /// <param name="texts">A sequence of lists of strings, each containing one token.</param>
        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
        /// <returns></returns>
        public NDArray texts_to_matrix(IEnumerable<IList<string>> texts, string mode = "binary")
        {
            return sequences_to_matrix(texts_to_sequences(texts), mode);
        }

        /// <summary>
        /// Converts a list of sequences into a Numpy matrix.
        /// </summary>
        /// <param name="sequences">A sequence of lists of integers, encoding tokens.</param>
        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
        /// <returns></returns>
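        /// <remarks>
        /// Weighting per mode (a summary of the implementation below): "binary" stores 1 for every token present,
        /// "count" stores raw token counts, "freq" stores count / sequence_length, and "tfidf" stores
        /// (1 + ln(count)) * ln(1 + N / (1 + df)), where N is the number of fitted documents and df is the
        /// number of documents containing the token.
        /// </remarks>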
        public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences, string mode = "binary")
        {
            if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}");
            var word_count = 0;

            if (num_words == -1)
            {
                if (word_index != null)
                {
                    word_count = word_index.Count + 1;
                }
                else
                {
                    throw new InvalidOperationException("Specify a dimension ('num_words' argument), or fit on some text data first.");
                }
            }
            else
            {
                word_count = num_words;
            }

            if (mode == "tfidf" && this.document_count == 0)
            {
                throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode.");
            }

            var seqs = sequences.ToList(); // materialize once, so the input is not re-enumerated per row
            var x = np.zeros(seqs.Count, word_count);

            for (int i = 0; i < seqs.Count; i++)
            {
                var seq = seqs[i];
                if (seq == null || seq.Count == 0)
                    continue;

                var counts = new Dictionary<int, int>();
                var seq_length = seq.Count;

                foreach (var j in seq)
                {
                    if (j >= word_count)
                        continue;
                    var count = 0;
                    counts.TryGetValue(j, out count);
                    counts[j] = count + 1;
                }
if (mode == "count") | |||||
{ | |||||
foreach (var kv in counts) | |||||
{ | |||||
var j = kv.Key; | |||||
var c = kv.Value; | |||||
x[i, j] = c; | |||||
} | |||||
} | |||||
else if (mode == "freq") | |||||
{ | |||||
foreach (var kv in counts) | |||||
{ | |||||
var j = kv.Key; | |||||
var c = kv.Value; | |||||
x[i, j] = ((double)c) / seq_length; | |||||
} | |||||
} | |||||
else if (mode == "binary") | |||||
{ | |||||
foreach (var kv in counts) | |||||
{ | |||||
var j = kv.Key; | |||||
var c = kv.Value; | |||||
x[i, j] = 1; | |||||
} | |||||
} | |||||
else if (mode == "tfidf") | |||||
{ | |||||
foreach (var kv in counts) | |||||
{ | |||||
var j = kv.Key; | |||||
var c = kv.Value; | |||||
var id = 0; | |||||
var _ = index_docs.TryGetValue(j, out id); | |||||
var tf = 1 + np.log(c); | |||||
var idf = np.log(1 + document_count / (1 + id)); | |||||
x[i, j] = tf * idf; | |||||
} | |||||
} | |||||
} | |||||
return x; | |||||
} | |||||
        private string[] modes = new string[] { "binary", "count", "tfidf", "freq" };
    }
}
@@ -15,7 +15,9 @@
 ******************************************************************************/

using NumSharp;
using NumSharp.Utilities;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Tensorflow.Keras
@@ -34,14 +36,18 @@ namespace Tensorflow.Keras
        /// <param name="truncating">String, 'pre' or 'post'</param>
        /// <param name="value">Float or String, padding value.</param>
        /// <returns></returns>
        public NDArray pad_sequences(IEnumerable<int[]> sequences,
            int? maxlen = null,
            string dtype = "int32",
            string padding = "pre",
            string truncating = "pre",
            object value = null)
        {
            if (value != null) throw new NotImplementedException("padding with a specific value.");
            if (padding != "pre" && padding != "post") throw new InvalidArgumentError("padding must be 'pre' or 'post'.");
            if (truncating != "pre" && truncating != "post") throw new InvalidArgumentError("truncating must be 'pre' or 'post'.");

            var length = sequences.Select(s => s.Length).ToList();

            if (maxlen == null)
                maxlen = length.Max();
@@ -49,19 +55,26 @@ namespace Tensorflow.Keras
            if (value == null)
                value = 0f;

#pragma warning disable CS0162 // Unreachable code detected
            var type = getNPType(dtype);
            var nd = new NDArray(type, new Shape(length.Count, maxlen.Value), true);
            for (int i = 0; i < nd.shape[0]; i++)
#pragma warning restore CS0162 // Unreachable code detected
            {
                var s = sequences.ElementAt(i);
                if (s.Length > maxlen.Value)
                {
                    s = (truncating == "pre") ? s.Slice(s.Length - maxlen.Value, s.Length) : s.Slice(0, maxlen.Value);
                }
                var sliceString = (padding == "pre") ? $"{i},{maxlen - s.Length}:" : $"{i},:{s.Length}";
                nd[sliceString] = np.array(s);
            }

            return nd;
        }
        private Type getNPType(string typeName)
        {
            // np exposes its dtypes (np.int32, np.float32, ...) as public static fields of type System.Type.
            return typeof(np).GetField(typeName).GetValue(null) as Type;
        }
    }
}
@@ -0,0 +1,35 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Tensorflow.Keras.Text;

namespace Tensorflow.Keras
{
    public class TextApi
    {
        public Tensorflow.Keras.Text.Tokenizer Tokenizer(
            int num_words = -1,
            string filters = DefaultFilter,
            bool lower = true,
            char split = ' ',
            bool char_level = false,
            string oov_token = null,
            Func<string, IEnumerable<string>> analyzer = null)
        {
            return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer);
        }
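        /// <summary>
        /// Converts a text to a sequence of words (tokens), dropping filtered characters and optionally lower-casing.
        /// </summary>
        /// <example>
        /// A minimal sketch of the expected behaviour (not part of the original source):
        /// <code>
        /// var words = TextApi.text_to_word_sequence("The quick brown fox!");
        /// // words: ["the", "quick", "brown", "fox"]
        /// </code>
        /// </example>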
        public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ')
        {
            if (lower)
            {
                text = text.ToLower();
            }
            var newText = new string(text.Where(c => !filters.Contains(c)).ToArray());
            return newText.Split(split);
        }
        private const string DefaultFilter = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n";
    }
}
@@ -0,0 +1,413 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using NumSharp;
using Tensorflow;
using static Tensorflow.KerasApi;
namespace TensorFlowNET.Keras.UnitTest
{
    [TestClass]
    public class PreprocessingTests : EagerModeTestBase
    {
        private readonly string[] texts = new string[] {
            "It was the best of times, it was the worst of times.",
            "Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
            "It was the best of times, it was the worst of times.",
            "Mr and Mrs Dursley of number four, Privet Drive.",
        };

        private readonly string[][] tokenized_texts = new string[][] {
            new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
            new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"},
            new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
            new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"},
        };

        private readonly string[] processed_texts = new string[] {
            "it was the best of times it was the worst of times",
            "mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
            "it was the best of times it was the worst of times",
            "mr and mrs dursley of number four privet drive",
        };

        private const string OOV = "<OOV>";
        [TestMethod]
        public void TokenizeWithNoOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);
            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["number"]);
            Assert.AreEqual(16, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithNoOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();

            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);
            Assert.AreEqual(7, tokenizer.word_index["worst"]);
            Assert.AreEqual(12, tokenizer.word_index["number"]);
            Assert.AreEqual(16, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(28, tokenizer.word_index.Count);
            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["number"]);
            Assert.AreEqual(17, tokenizer.word_index["were"]);
        }

        [TestMethod]
        public void TokenizeWithOOV_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);

            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);

            Assert.AreEqual(28, tokenizer.word_index.Count);
            Assert.AreEqual(1, tokenizer.word_index[OOV]);
            Assert.AreEqual(8, tokenizer.word_index["worst"]);
            Assert.AreEqual(13, tokenizer.word_index["number"]);
            Assert.AreEqual(17, tokenizer.word_index["were"]);
        }
        [TestMethod]
        public void TokenizeTextsToSequences()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequences_Tkn()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();

            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }
        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn1()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();

            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn2()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();

            // Use the list version, where the tokenization has already been done.
            tokenizer.fit_on_texts(tokenized_texts);
            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesAndBack_Tkn3()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            // Use the list version, where the tokenization has already been done.
            var sequences = tokenizer.texts_to_sequences(tokenized_texts);
            Assert.AreEqual(4, sequences.Count);

            var processed = tokenizer.sequences_to_texts(sequences);
            Assert.AreEqual(4, processed.Count);

            for (var i = 0; i < processed.Count; i++)
                Assert.AreEqual(processed_texts[i], processed[i]);
        }
        [TestMethod]
        public void TokenizeTextsToSequencesWithOOV()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    Assert.AreNotEqual(tokenizer.word_index[OOV], sequences[i][j]);
        }

        [TestMethod]
        public void TokenizeTextsToSequencesWithOOVPresent()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV, num_words: 20);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            Assert.AreEqual(4, sequences.Count);

            Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);

            var oov_count = 0;
            for (var i = 0; i < sequences.Count; i++)
                for (var j = 0; j < sequences[i].Length; j++)
                    if (tokenizer.word_index[OOV] == sequences[i][j])
                        oov_count += 1;

            Assert.AreEqual(9, oov_count);
        }
        [TestMethod]
        public void PadSequencesWithDefaults()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences);

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(22, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32());
            for (var i = 0; i < 8; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
            for (var i = 0; i < 20; i++)
                Assert.AreNotEqual(0, padded[1, i].GetInt32());
        }

        [TestMethod]
        public void PadSequencesPrePaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15);

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(15, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32());
            for (var i = 0; i < 3; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 3].GetInt32());
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(0, padded[1, i].GetInt32());
        }

        [TestMethod]
        public void PadSequencesPrePaddingTrunc_Larger()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45);

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(45, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 42].GetInt32());
            for (var i = 0; i < 33; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 33].GetInt32());
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 15, padding: "post", truncating: "post");

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(15, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
            for (var i = 12; i < 15; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
            for (var i = 0; i < 15; i++)
                Assert.AreNotEqual(0, padded[1, i].GetInt32());
        }

        [TestMethod]
        public void PadSequencesPostPaddingTrunc_Larger()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
            tokenizer.fit_on_texts(texts);

            var sequences = tokenizer.texts_to_sequences(texts);
            var padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen: 45, padding: "post", truncating: "post");

            Assert.AreEqual(4, padded.shape[0]);
            Assert.AreEqual(45, padded.shape[1]);

            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
            for (var i = 32; i < 45; i++)
                Assert.AreEqual(0, padded[0, i].GetInt32());
            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
        }
        [TestMethod]
        public void TextToMatrixBinary()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts);
            Assert.AreEqual(texts.Length, matrix.shape[0]);

            // Indices follow frequency order: of=1, it=2, was=3, the=4, times=5, best=6, worst=7, mr=8 ... much=27.
            CompareLists(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
            CompareLists(new double[] { 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
        }
        [TestMethod]
        public void TextToMatrixCount()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts, mode: "count");
            Assert.AreEqual(texts.Length, matrix.shape[0]);

            // In text 1, "of", "it", "was", "the", "times" (indices 1-5) occur twice; "best" and "worst" once.
            CompareLists(new double[] { 0, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
            // In text 2, "of" (index 1) occurs once and "were" (index 16) twice.
            CompareLists(new double[] { 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
        }
        [TestMethod]
        public void TextToMatrixFrequency()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts, mode: "freq");
            Assert.AreEqual(texts.Length, matrix.shape[0]);

            double t12 = 2.0 / 12.0;
            double o12 = 1.0 / 12.0;
            double t22 = 2.0 / 22.0;
            double o22 = 1.0 / 22.0;

            CompareLists(new double[] { 0, t12, t12, t12, t12, t12, o12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
            CompareLists(new double[] { 0, o22, 0, 0, 0, 0, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray<double>());
        }
        [TestMethod]
        public void TextToMatrixTDIDF()
        {
            var tokenizer = keras.preprocessing.text.Tokenizer();
            tokenizer.fit_on_texts(texts);

            Assert.AreEqual(27, tokenizer.word_index.Count);

            var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");
            Assert.AreEqual(texts.Length, matrix.shape[0]);

            // Expected weights follow tf = 1 + ln(count), idf = ln(1 + N / (1 + df)) with N = 4 documents.
            double t1 = (1.0 + Math.Log(2)) * Math.Log(1.0 + 4.0 / 3.0);  // count 2, in 2 documents ("it", "was", "the", "times")
            double t2 = Math.Log(1.0 + 4.0 / 3.0);                        // count 1, in 2 documents ("best", "worst", "mr" ... "drive")
            double t3 = (1.0 + Math.Log(2)) * Math.Log(1.0 + 4.0 / 2.0);  // count 2, in 1 document ("were")
            double t4 = Math.Log(1.0 + 4.0 / 2.0);                        // count 1, in 1 document ("proud" ... "much")
            double t5 = (1.0 + Math.Log(2)) * Math.Log(1.0 + 4.0 / 5.0);  // count 2, in all 4 documents ("of" in text 1)
            double t6 = Math.Log(1.0 + 4.0 / 5.0);                        // count 1, in all 4 documents ("of" in text 2)

            CompareLists(new double[] { 0, t5, t1, t1, t1, t1, t2, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
            CompareLists(new double[] { 0, t6, 0, 0, 0, 0, 0, 0, t2, t2, t2, t2, t2, t2, t2, t2, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>());
        }
        private void CompareLists<T>(IList<T> expected, IList<T> actual)
        {
            Assert.AreEqual(expected.Count, actual.Count);
            for (var i = 0; i < expected.Count; i++)
            {
                Assert.AreEqual(expected[i], actual[i]);
            }
        }
    }
}