
Implemented text_to_matrix() and unit tests.

pull/756/head
Niklas Gustafsson, 4 years ago
parent commit 12f4f230f8

2 changed files with 202 additions and 12 deletions:
  1. src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs (+116, -3)
  2. test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs (+86, -9)

src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs (+116, -3)

@@ -3,6 +3,7 @@ using Serilog.Debugging;
 using System;
 using System.Collections.Generic;
 using System.Collections.Specialized;
+using System.Data.SqlTypes;
 using System.Linq;
 using System.Net.Sockets;
 using System.Text;
@@ -314,14 +315,126 @@ namespace Tensorflow.Keras.Text
             });
         }
 
+        /// <summary>
+        /// Convert a list of texts to a Numpy matrix.
+        /// </summary>
+        /// <param name="texts">A sequence of strings containing one or more tokens.</param>
+        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
+        /// <returns></returns>
+        public NDArray texts_to_matrix(IEnumerable<string> texts, string mode = "binary")
+        {
+            return sequences_to_matrix(texts_to_sequences(texts), mode);
+        }
+
+        /// <summary>
+        /// Convert a list of texts to a Numpy matrix.
+        /// </summary>
+        /// <param name="texts">A sequence of lists of strings, each string being a single token.</param>
+        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
+        /// <returns></returns>
+        public NDArray texts_to_matrix(IEnumerable<IList<string>> texts, string mode = "binary")
+        {
+            return sequences_to_matrix(texts_to_sequences(texts), mode);
+        }
+
         /// <summary>
         /// Converts a list of sequences into a Numpy matrix.
         /// </summary>
-        /// <param name="sequences"></param>
+        /// <param name="sequences">A sequence of lists of integers, encoding tokens.</param>
+        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
         /// <returns></returns>
-        public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences)
+        public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences, string mode = "binary")
         {
-            throw new NotImplementedException("sequences_to_matrix");
+            if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}");
+            var word_count = 0;
+
+            if (num_words == -1)
+            {
+                if (word_index != null)
+                {
+                    word_count = word_index.Count + 1;
+                }
+                else
+                {
+                    throw new InvalidOperationException("Specify a dimension ('num_words' argument), or fit on some text data first.");
+                }
+            }
+            else
+            {
+                word_count = num_words;
+            }
+
+            if (mode == "tfidf" && this.document_count == 0)
+            {
+                throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode.");
+            }
+
+            var x = np.zeros(sequences.Count(), word_count);
+
+            for (int i = 0; i < sequences.Count(); i++)
+            {
+                var seq = sequences.ElementAt(i);
+                if (seq == null || seq.Count == 0)
+                    continue;
+
+                var counts = new Dictionary<int, int>();
+
+                var seq_length = seq.Count;
+
+                foreach (var j in seq)
+                {
+                    if (j >= word_count)
+                        continue;
+                    var count = 0;
+                    counts.TryGetValue(j, out count);
+                    counts[j] = count + 1;
+                }
+
+                if (mode == "count")
+                {
+                    foreach (var kv in counts)
+                    {
+                        var j = kv.Key;
+                        var c = kv.Value;
+                        x[i, j] = c;
+                    }
+                }
+                else if (mode == "freq")
+                {
+                    foreach (var kv in counts)
+                    {
+                        var j = kv.Key;
+                        var c = kv.Value;
+                        x[i, j] = ((double)c) / seq_length;
+                    }
+                }
+                else if (mode == "binary")
+                {
+                    foreach (var kv in counts)
+                    {
+                        var j = kv.Key;
+                        x[i, j] = 1;
+                    }
+                }
+                else if (mode == "tfidf")
+                {
+                    // tf-idf weighting: tf = 1 + ln(count), idf = ln(1 + document_count / (1 + number of documents containing token j))
+                    foreach (var kv in counts)
+                    {
+                        var j = kv.Key;
+                        var c = kv.Value;
+                        var id = 0;
+                        var _ = index_docs.TryGetValue(j, out id);
+                        var tf = 1 + np.log(c);
+                        var idf = np.log(1 + document_count / (1 + id));
+                        x[i, j] = tf * idf;
+                    }
+                }
+            }
+
+            return x;
         }
 
+        private string[] modes = new string[] { "binary", "count", "tfidf", "freq" };
     }
 }
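For orientation, here is a minimal usage sketch of the new API. It is not part of the commit; the sample texts are illustrative, and it assumes the same call pattern the unit tests below use (a Tokenizer created via keras.preprocessing.text.Tokenizer() and fit with fit_on_texts).

    // Fit a tokenizer, then vectorize the same texts into a dense matrix.
    var texts = new[] { "To be or not to be.", "That is the question." };
    var tokenizer = keras.preprocessing.text.Tokenizer();
    tokenizer.fit_on_texts(texts);

    // One row per text, one column per word index; column 0 stays unused,
    // so the shape is (texts.Length, word_index.Count + 1) when num_words is not set.
    var binary = tokenizer.texts_to_matrix(texts);                  // default mode: "binary"
    var counts = tokenizer.texts_to_matrix(texts, mode: "count");   // raw token counts
    var freqs  = tokenizer.texts_to_matrix(texts, mode: "freq");    // counts divided by tokens in the text
    var tfidf  = tokenizer.texts_to_matrix(texts, mode: "tfidf");   // requires a fitted tokenizer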

test/TensorFlowNET.Keras.UnitTest/PreprocessingTests.cs (+86, -9)

@@ -7,6 +7,7 @@ using NumSharp;
 using static Tensorflow.KerasApi;
 using Tensorflow;
 using Tensorflow.Keras.Datasets;
+using Microsoft.Extensions.DependencyInjection;
 
 namespace TensorFlowNET.Keras.UnitTest
 {
@@ -240,9 +241,6 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, padded.shape[0]);
             Assert.AreEqual(22, padded.shape[1]);
 
-            var firstRow = padded[0];
-            var secondRow = padded[1];
-
             Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32());
             for (var i = 0; i < 8; i++)
                 Assert.AreEqual(0, padded[0, i].GetInt32());
@@ -263,9 +261,6 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, padded.shape[0]);
             Assert.AreEqual(15, padded.shape[1]);
 
-            var firstRow = padded[0];
-            var secondRow = padded[1];
-
             Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32());
             for (var i = 0; i < 3; i++)
                 Assert.AreEqual(0, padded[0, i].GetInt32());
@@ -286,9 +281,6 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, padded.shape[0]);
             Assert.AreEqual(15, padded.shape[1]);
 
-            var firstRow = padded[0];
-            var secondRow = padded[1];
-
             Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
             for (var i = 12; i < 15; i++)
                 Assert.AreEqual(0, padded[0, i].GetInt32());
@@ -296,5 +288,90 @@ namespace TensorFlowNET.Keras.UnitTest
             for (var i = 0; i < 15; i++)
                 Assert.AreNotEqual(0, padded[1, i].GetInt32());
         }
+
+        [TestMethod]
+        public void TextToMatrixBinary()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            Assert.AreEqual(27, tokenizer.word_index.Count);
+
+            var matrix = tokenizer.texts_to_matrix(texts);
+
+            Assert.AreEqual(texts.Length, matrix.shape[0]);
+
+            CompareLists(new double[] { 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
+            CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
+        }
+
+        [TestMethod]
+        public void TextToMatrixCount()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            Assert.AreEqual(27, tokenizer.word_index.Count);
+
+            var matrix = tokenizer.texts_to_matrix(texts, mode: "count");
+
+            Assert.AreEqual(texts.Length, matrix.shape[0]);
+
+            CompareLists(new double[] { 0, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
+            CompareLists(new double[] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, matrix[1].ToArray<double>());
+        }
+
+        [TestMethod]
+        public void TextToMatrixFrequency()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            Assert.AreEqual(27, tokenizer.word_index.Count);
+
+            var matrix = tokenizer.texts_to_matrix(texts, mode: "freq");
+
+            Assert.AreEqual(texts.Length, matrix.shape[0]);
+
+            double t12 = 2.0 / 12.0;
+            double o12 = 1.0 / 12.0;
+            double t22 = 2.0 / 22.0;
+            double o22 = 1.0 / 22.0;
+
+            CompareLists(new double[] { 0, t12, t12, t12, o12, t12, t12, o12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
+            CompareLists(new double[] { 0, 0, 0, 0, 0, o22, 0, 0, o22, o22, o22, o22, o22, o22, o22, o22, t22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22, o22 }, matrix[1].ToArray<double>());
+        }
+
+        [TestMethod]
+        public void TextToMatrixTfIdf()
+        {
+            var tokenizer = keras.preprocessing.text.Tokenizer();
+            tokenizer.fit_on_texts(texts);
+
+            Assert.AreEqual(27, tokenizer.word_index.Count);
+
+            var matrix = tokenizer.texts_to_matrix(texts, mode: "tfidf");
+
+            Assert.AreEqual(texts.Length, matrix.shape[0]);
+
+            double t1 = 1.1736001944781467;
+            double t2 = 0.69314718055994529;
+            double t3 = 1.860112299086919;
+            double t4 = 1.0986122886681098;
+            double t5 = 0.69314718055994529;
+
+            CompareLists(new double[] { 0, t1, t1, t1, t2, 0, t1, t2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, matrix[0].ToArray<double>());
+            CompareLists(new double[] { 0, 0, 0, 0, 0, 0, 0, 0, t5, t5, t5, t5, t5, t5, t5, t5, t3, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4, t4 }, matrix[1].ToArray<double>());
+        }
+
+        private void CompareLists<T>(IList<T> expected, IList<T> actual)
+        {
+            Assert.AreEqual(expected.Count, actual.Count);
+            for (var i = 0; i < expected.Count; i++)
+            {
+                Assert.AreEqual(expected[i], actual[i]);
+            }
+        }
+
     }
 }
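A note on the tf-idf expectations: the hard-coded constants in the test above are consistent with the weighting used in sequences_to_matrix, tf = 1 + ln(count) and idf = ln(1 + document_count / (1 + documents containing the token)), assuming the four texts in the test fixture give document_count = 4. Which factor applies to each column depends on that fixture, which is defined elsewhere in the file, so the mapping below is a plausibility check rather than part of the commit.

    // Illustrative sanity check of the expected tf-idf constants (not committed code).
    double t2 = Math.Log(2);                      // tf = 1 (count 1), idf = ln 2  ≈ 0.69315
    double t4 = Math.Log(3);                      // tf = 1 (count 1), idf = ln 3  ≈ 1.09861
    double t1 = (1 + Math.Log(2)) * Math.Log(2);  // tf = 1 + ln 2 (count 2), idf = ln 2  ≈ 1.17360
    double t3 = (1 + Math.Log(2)) * Math.Log(3);  // tf = 1 + ln 2 (count 2), idf = ln 3  ≈ 1.86011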
