This reverts commit a468cd4c0c.
@@ -5,6 +5,8 @@ using System.Text;
 using Tensorflow.Keras.Utils;
 using NumSharp;
 using System.Linq;
+using NumSharp.Utilities;
+using Tensorflow.Queues;
 
 namespace Tensorflow.Keras.Datasets
 {
@@ -15,8 +17,10 @@ namespace Tensorflow.Keras.Datasets
     /// </summary>
     public class Imdb
     {
-        string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
-        string file_name = "imdb.npz";
+        //string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
+        string origin_folder = "http://ai.stanford.edu/~amaas/data/sentiment/";
+        //string file_name = "imdb.npz";
+        string file_name = "aclImdb_v1.tar.gz";
         string dest_folder = "imdb";
 
         /// <summary>
@@ -37,38 +41,66 @@ namespace Tensorflow.Keras.Datasets
             int maxlen = -1,
             int seed = 113,
             int start_char = 1,
-            int oov_char= 2,
+            int oov_char = 2,
             int index_from = 3)
         {
             var dst = Download();
-            var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
-            var x_train_string = new string[lines.Length];
-            var y_train = np.zeros(new int[] { lines.Length }, NPTypeCode.Int64);
-            for (int i = 0; i < lines.Length; i++)
-            {
-                y_train[i] = long.Parse(lines[i].Substring(0, 1));
-                x_train_string[i] = lines[i].Substring(2);
-            }
-            var x_train = np.array(x_train_string);
-            File.ReadAllLines(Path.Combine(dst, "imdb_test.txt"));
-            var x_test_string = new string[lines.Length];
-            var y_test = np.zeros(new int[] { lines.Length }, NPTypeCode.Int64);
-            for (int i = 0; i < lines.Length; i++)
-            {
-                y_test[i] = long.Parse(lines[i].Substring(0, 1));
-                x_test_string[i] = lines[i].Substring(2);
-            }
-            var x_test = np.array(x_test_string);
+            var vocab = BuildVocabulary(Path.Combine(dst, "imdb.vocab"), start_char, oov_char, index_from);
+            var (x_train, y_train) = GetDataSet(Path.Combine(dst, "train"));
+            var (x_test, y_test) = GetDataSet(Path.Combine(dst, "test"));
             return new DatasetPass
             {
                 Train = (x_train, y_train),
                 Test = (x_test, y_test)
             };
         }
 
+        private static Dictionary<string, int> BuildVocabulary(string path,
+            int start_char,
+            int oov_char,
+            int index_from)
+        {
+            var words = File.ReadAllLines(path);
+            var result = new Dictionary<string, int>();
+            var idx = index_from;
+            foreach (var word in words)
+            {
+                result[word] = idx;
+                idx += 1;
+            }
+            return result;
+        }
+
+        private static (NDArray, NDArray) GetDataSet(string path)
+        {
+            var posFiles = Directory.GetFiles(Path.Combine(path, "pos")).Slice(0, 10);
+            var negFiles = Directory.GetFiles(Path.Combine(path, "neg")).Slice(0, 10);
+            var x_string = new string[posFiles.Length + negFiles.Length];
+            var y = new int[posFiles.Length + negFiles.Length];
+            var trg = 0;
+            var longest = 0;
+            for (int i = 0; i < posFiles.Length; i++, trg++)
+            {
+                y[trg] = 1;
+                x_string[trg] = File.ReadAllText(posFiles[i]);
+                longest = Math.Max(longest, x_string[trg].Length);
+            }
+            for (int i = 0; i < negFiles.Length; i++, trg++) // fixed: was posFiles.Length; harmless only while both slices are length 10
+            {
+                y[trg] = 0;
+                x_string[trg] = File.ReadAllText(negFiles[i]);
+                longest = Math.Max(longest, x_string[trg].Length);
+            }
+            var x = np.array(x_string);
+            return (x, y);
+        }
 
         (NDArray, NDArray) LoadX(byte[] bytes)
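For orientation, a minimal usage sketch of the reworked loader. `DatasetPass` with its `Train`/`Test` tuples comes from this diff; the `load_data` entry point follows the Keras convention and the rest is illustrative:

```csharp
// Illustrative only; assumes the Imdb API shown in this diff.
var imdb = new Tensorflow.Keras.Datasets.Imdb();
var dataset = imdb.load_data();          // downloads and extracts aclImdb_v1.tar.gz on first run
var (x_train, y_train) = dataset.Train;  // x: NDArray of raw review strings, y: 1 = pos, 0 = neg
var (x_test, y_test) = dataset.Test;
```

Note that `Slice(0, 10)` in `GetDataSet` currently caps each split at 10 positive and 10 negative reviews, and both `vocab` and `longest` are computed but never used; those look like leftover scaffolding worth flagging.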
@@ -90,8 +122,9 @@ namespace Tensorflow.Keras.Datasets
             Web.Download(origin_folder + file_name, dst, file_name);
-            return dst;
-            // return Path.Combine(dst, file_name);
+            Tensorflow.Keras.Utils.Compress.ExtractTGZ(Path.Combine(dst, file_name), dst);
+
+            return Path.Combine(dst, "aclImdb");
         }
     }
 }
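For reference, the Stanford archive unpacks to the layout the new code relies on (abridged):

```
aclImdb/
├── imdb.vocab    # one word per line, consumed by BuildVocabulary
├── train/
│   ├── pos/      # one review per *.txt file
│   └── neg/
└── test/
    ├── pos/
    └── neg/
```

Returning `Path.Combine(dst, "aclImdb")` lets `load_data` resolve `imdb.vocab`, `train`, and `test` directly beneath the extraction root.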
@@ -56,7 +56,7 @@ namespace Tensorflow.Keras.Text
         /// <summary>
         /// Updates internal vocabulary based on a list of texts.
         /// </summary>
-        /// <param name="texts"></param>
+        /// <param name="texts">A list of strings, each containing one or more tokens.</param>
+        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
         public void fit_on_texts(IEnumerable<string> texts)
         {
@@ -90,7 +90,7 @@ namespace Tensorflow.Keras.Text
             }
 
             var wcounts = word_counts.AsEnumerable().ToList();
-            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));
+            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order.
 
             var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
             sorted_voc.AddRange(word_counts.Select(kv => kv.Key));
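The new comment is accurate: negating `CompareTo` turns the natural ascending sort into a descending one. A standalone illustration:

```csharp
using System.Collections.Generic;

var counts = new List<KeyValuePair<string, int>>
{
    new KeyValuePair<string, int>("the", 3),
    new KeyValuePair<string, int>("worst", 1),
    new KeyValuePair<string, int>("of", 2),
};
// Negating CompareTo flips ascending into descending order.
counts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value));
// counts is now: the=3, of=2, worst=1
```

Worth noting: `sorted_voc` is still populated from `word_counts` (insertion order) rather than from the sorted `wcounts`, which is why the unit tests below see word indices in first-seen order.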
@@ -120,7 +120,12 @@ namespace Tensorflow.Keras.Text
             }
         }
 
-        public void fit_on_texts(IEnumerable<IList<string>> texts)
+        /// <summary>
+        /// Updates internal vocabulary based on a list of texts.
+        /// </summary>
+        /// <param name="texts">A list of lists of strings, each containing one token.</param>
+        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
+        public void fit_on_texts(IEnumerable<IEnumerable<string>> texts)
         {
             foreach (var seq in texts)
             {
@@ -197,7 +202,7 @@ namespace Tensorflow.Keras.Text
         /// <param name="texts"></param>
         /// <returns></returns>
         /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
-        public IList<int[]> texts_to_sequences(IEnumerable<IList<string>> texts)
+        public IList<int[]> texts_to_sequences(IEnumerable<IEnumerable<string>> texts)
         {
             return texts_to_sequences_generator(texts).ToArray();
         }
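Widening the parameter to `IEnumerable<IEnumerable<string>>` lets any lazily produced token stream through, not just materialized lists. A minimal sketch, where `docs` is a hypothetical `IEnumerable<string>` of whitespace-tokenizable documents:

```csharp
var tokenizer = keras.preprocessing.text.Tokenizer();
IEnumerable<IEnumerable<string>> tokens = docs.Select(d => d.Split(' ').AsEnumerable());
tokenizer.fit_on_texts(tokens);                         // must run before texts_to_sequences
IList<int[]> sequences = tokenizer.texts_to_sequences(tokens);
```

One caveat with lazy inputs: the enumerable is walked twice here (once to fit, once to convert), so any expensive projection runs twice.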
@@ -224,6 +229,13 @@ namespace Tensorflow.Keras.Text
             });
         }
 
+        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IEnumerable<string>> texts)
+        {
+            int oov_index = -1;
+            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
+            return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
+        }
+
         private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
         {
             var vect = new List<int>();
@@ -244,7 +256,7 @@ namespace Tensorflow.Keras.Text
                         vect.Add(i);
                     }
                 }
-                else if(oov_index != -1)
+                else if (oov_index != -1)
                 {
                     vect.Add(oov_index);
                 }
@@ -253,13 +265,6 @@ namespace Tensorflow.Keras.Text
             return vect;
         }
 
-        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IList<string>> texts)
-        {
-            int oov_index = -1;
-            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
-            return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
-        }
-
         /// <summary>
         /// Transforms each sequence into a list of text.
         /// </summary>
@@ -271,7 +276,7 @@ namespace Tensorflow.Keras.Text
             return sequences_to_texts_generator(sequences).ToArray();
         }
 
-        public IEnumerable<string> sequences_to_texts_generator(IEnumerable<int[]> sequences)
+        public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences)
         {
             int oov_index = -1;
             var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
@@ -280,7 +285,7 @@ namespace Tensorflow.Keras.Text
             {
                 var bldr = new StringBuilder();
-                for (var i = 0; i < seq.Length; i++)
+                for (var i = 0; i < seq.Count; i++)
                 {
                     if (i > 0) bldr.Append(' ');
@@ -314,7 +319,7 @@ namespace Tensorflow.Keras.Text
         /// </summary>
         /// <param name="sequences"></param>
         /// <returns></returns>
-        public NDArray sequences_to_matrix(IEnumerable<int[]> sequences)
+        public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences)
         {
             throw new NotImplementedException("sequences_to_matrix");
         }
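`sequences_to_matrix` still throws. As a reference point only, a hedged sketch of what a Keras-style "binary" mode could look like here; `num_words` is assumed to be a field on this class as in Keras, and none of this is part of the PR:

```csharp
// Hypothetical sketch of mode="binary": one row per sequence, 1.0 where a word id occurs.
public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences)
{
    var seqs = sequences.ToList();
    var matrix = np.zeros(new int[] { seqs.Count, num_words }, NPTypeCode.Double);
    for (var i = 0; i < seqs.Count; i++)
        foreach (var j in seqs[i])
            if (j < num_words)
                matrix[i, j] = 1.0;   // mark word id j as present in sequence i
    return matrix;
}
```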
@@ -15,23 +15,23 @@ namespace TensorFlowNET.Keras.UnitTest
     {
         private readonly string[] texts = new string[] {
             "It was the best of times, it was the worst of times.",
-            "this is a new dawn, an era to follow the previous era. It can not be said to start anew.",
+            "Mr and Mrs Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
             "It was the best of times, it was the worst of times.",
-            "this is a new dawn, an era to follow the previous era.",
+            "Mr and Mrs Dursley of number four, Privet Drive.",
         };
 
         private readonly string[][] tokenized_texts = new string[][] {
             new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
-            new string[] {"this","is","a","new","dawn","an","era","to","follow","the","previous","era","It","can","not","be","said","to","start","anew" },
+            new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive","were","proud","to","say","that","they","were","perfectly","normal","thank","you","very","much"},
             new string[] {"It","was","the","best","of","times","it","was","the","worst","of","times"},
-            new string[] {"this","is","a","new","dawn","an","era","to","follow","the","previous","era" },
+            new string[] {"mr","and","mrs","dursley","of","number","four","privet","drive"},
         };
 
         private readonly string[] processed_texts = new string[] {
             "it was the best of times it was the worst of times",
-            "this is a new dawn an era to follow the previous era it can not be said to start anew",
+            "mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
             "it was the best of times it was the worst of times",
-            "this is a new dawn an era to follow the previous era",
+            "mr and mrs dursley of number four privet drive",
         };
 
         private const string OOV = "<OOV>";
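The updated expectations follow from the new fixture: the first text contributes 7 distinct lowercased words (it, was, the, best, of, times, worst), the long Dursley sentence adds 20 more, and texts 3 and 4 add none, giving 27; with an OOV token reserved at index 1 everything shifts up by one, hence 28, worst=8, number=13, were=17 in the OOV variants. A quick self-check against the fixture above:

```csharp
// 7 distinct words from text 0 + 20 new ones from text 1 = 27; texts 2 and 3 add nothing.
var distinct = processed_texts.SelectMany(t => t.Split(' ')).Distinct().Count();
```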
@@ -42,11 +42,11 @@ namespace TensorFlowNET.Keras.UnitTest
             var tokenizer = keras.preprocessing.text.Tokenizer();
             tokenizer.fit_on_texts(texts);
 
-            Assert.AreEqual(23, tokenizer.word_index.Count);
+            Assert.AreEqual(27, tokenizer.word_index.Count);
 
             Assert.AreEqual(7, tokenizer.word_index["worst"]);
-            Assert.AreEqual(12, tokenizer.word_index["dawn"]);
-            Assert.AreEqual(16, tokenizer.word_index["follow"]);
+            Assert.AreEqual(12, tokenizer.word_index["number"]);
+            Assert.AreEqual(16, tokenizer.word_index["were"]);
         }
 
         [TestMethod]
@@ -56,11 +56,11 @@ namespace TensorFlowNET.Keras.UnitTest
             // Use the list version, where the tokenization has already been done.
             tokenizer.fit_on_texts(tokenized_texts);
 
-            Assert.AreEqual(23, tokenizer.word_index.Count);
+            Assert.AreEqual(27, tokenizer.word_index.Count);
 
             Assert.AreEqual(7, tokenizer.word_index["worst"]);
-            Assert.AreEqual(12, tokenizer.word_index["dawn"]);
-            Assert.AreEqual(16, tokenizer.word_index["follow"]);
+            Assert.AreEqual(12, tokenizer.word_index["number"]);
+            Assert.AreEqual(16, tokenizer.word_index["were"]);
         }
 
         [TestMethod]
@@ -69,12 +69,12 @@ namespace TensorFlowNET.Keras.UnitTest
             var tokenizer = keras.preprocessing.text.Tokenizer(oov_token: OOV);
             tokenizer.fit_on_texts(texts);
 
-            Assert.AreEqual(24, tokenizer.word_index.Count);
+            Assert.AreEqual(28, tokenizer.word_index.Count);
 
             Assert.AreEqual(1, tokenizer.word_index[OOV]);
             Assert.AreEqual(8, tokenizer.word_index["worst"]);
-            Assert.AreEqual(13, tokenizer.word_index["dawn"]);
-            Assert.AreEqual(17, tokenizer.word_index["follow"]);
+            Assert.AreEqual(13, tokenizer.word_index["number"]);
+            Assert.AreEqual(17, tokenizer.word_index["were"]);
         }
 
         [TestMethod]
@@ -84,12 +84,12 @@ namespace TensorFlowNET.Keras.UnitTest
             // Use the list version, where the tokenization has already been done.
             tokenizer.fit_on_texts(tokenized_texts);
 
-            Assert.AreEqual(24, tokenizer.word_index.Count);
+            Assert.AreEqual(28, tokenizer.word_index.Count);
 
             Assert.AreEqual(1, tokenizer.word_index[OOV]);
             Assert.AreEqual(8, tokenizer.word_index["worst"]);
-            Assert.AreEqual(13, tokenizer.word_index["dawn"]);
-            Assert.AreEqual(17, tokenizer.word_index["follow"]);
+            Assert.AreEqual(13, tokenizer.word_index["number"]);
+            Assert.AreEqual(17, tokenizer.word_index["were"]);
         }
 
         [TestMethod]
@@ -102,7 +102,7 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, sequences.Count);
 
             Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
-            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
         }
 
         [TestMethod]
@@ -116,7 +116,7 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, sequences.Count);
 
             Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
-            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
         }
 
         [TestMethod]
@@ -200,7 +200,7 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, sequences.Count);
             Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
-            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
 
             for (var i = 0; i < sequences.Count; i++)
                 for (var j = 0; j < sequences[i].Length; j++)
@@ -217,7 +217,7 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(4, sequences.Count);
             Assert.AreEqual(tokenizer.word_index["worst"], sequences[0][9]);
-            Assert.AreEqual(tokenizer.word_index["previous"], sequences[1][10]);
+            Assert.AreEqual(tokenizer.word_index["proud"], sequences[1][10]);
 
             var oov_count = 0;
             for (var i = 0; i < sequences.Count; i++)
@@ -225,7 +225,7 @@ namespace TensorFlowNET.Keras.UnitTest
                     if (tokenizer.word_index[OOV] == sequences[i][j])
                         oov_count += 1;
 
-            Assert.AreEqual(5, oov_count);
+            Assert.AreEqual(9, oov_count);
         }
 
         [TestMethod]
@@ -238,15 +238,15 @@ namespace TensorFlowNET.Keras.UnitTest
             var padded = keras.preprocessing.sequence.pad_sequences(sequences);
 
             Assert.AreEqual(4, padded.shape[0]);
-            Assert.AreEqual(20, padded.shape[1]);
+            Assert.AreEqual(22, padded.shape[1]);
 
             var firstRow = padded[0];
             var secondRow = padded[1];
 
-            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 17].GetInt32());
+            Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 19].GetInt32());
             for (var i = 0; i < 8; i++)
                 Assert.AreEqual(0, padded[0, i].GetInt32());
-            Assert.AreEqual(tokenizer.word_index["previous"], padded[1, 10].GetInt32());
+            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
             for (var i = 0; i < 20; i++)
                 Assert.AreNotEqual(0, padded[1, i].GetInt32());
         }
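The shifted indices are plain pad-width arithmetic: the longest row is now the 22-token Dursley sentence instead of the 20-token dawn sentence, so with default pre-padding the 12-token first text gains 10 leading zeros and "worst" moves from 9 + 8 = 17 to 9 + 10 = 19, while row 1 is itself 22 tokens long and keeps "proud" at its natural index 10.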
@@ -269,7 +269,7 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 12].GetInt32());
             for (var i = 0; i < 3; i++)
                 Assert.AreEqual(0, padded[0, i].GetInt32());
-            Assert.AreEqual(tokenizer.word_index["previous"], padded[1, 5].GetInt32());
+            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 3].GetInt32());
             for (var i = 0; i < 15; i++)
                 Assert.AreNotEqual(0, padded[1, i].GetInt32());
         }
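Same arithmetic under maxlen: 15 with default pre-truncation: the 22-token row drops its first 7 tokens, so "proud" (original index 10) lands at 10 - 7 = 3; previously the 20-token dawn row dropped 5 tokens, leaving "previous" at 10 - 5 = 5.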
@@ -292,7 +292,7 @@ namespace TensorFlowNET.Keras.UnitTest
             Assert.AreEqual(tokenizer.word_index["worst"], padded[0, 9].GetInt32());
             for (var i = 12; i < 15; i++)
                 Assert.AreEqual(0, padded[0, i].GetInt32());
-            Assert.AreEqual(tokenizer.word_index["previous"], padded[1, 10].GetInt32());
+            Assert.AreEqual(tokenizer.word_index["proud"], padded[1, 10].GetInt32());
             for (var i = 0; i < 15; i++)
                 Assert.AreNotEqual(0, padded[1, i].GetInt32());
         }