@@ -5,8 +5,6 @@ using System.Text;
 using Tensorflow.Keras.Utils;
 using NumSharp;
 using System.Linq;
-using NumSharp.Utilities;
-using Tensorflow.Queues;
 
 namespace Tensorflow.Keras.Datasets
 {
@@ -17,10 +15,8 @@ namespace Tensorflow.Keras.Datasets
     /// </summary>
     public class Imdb
     {
-        //string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
-        string origin_folder = "http://ai.stanford.edu/~amaas/data/sentiment/";
-        //string file_name = "imdb.npz";
-        string file_name = "aclImdb_v1.tar.gz";
+        string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
+        string file_name = "imdb.npz";
         string dest_folder = "imdb";
 
         /// <summary>
@@ -46,61 +42,33 @@ namespace Tensorflow.Keras.Datasets
         {
             var dst = Download();
 
-            var vocab = BuildVocabulary(Path.Combine(dst, "imdb.vocab"), start_char, oov_char, index_from);
-
-            var (x_train, y_train) = GetDataSet(Path.Combine(dst, "train"));
-            var (x_test, y_test) = GetDataSet(Path.Combine(dst, "test"));
-
-            return new DatasetPass
-            {
-                Train = (x_train, y_train),
-                Test = (x_test, y_test)
-            };
-        }
-
-        private static Dictionary<string, int> BuildVocabulary(string path,
-            int start_char,
-            int oov_char,
-            int index_from)
-        {
-            var words = File.ReadAllLines(path);
-            var result = new Dictionary<string, int>();
-            var idx = index_from;
-
-            foreach (var word in words)
-            {
-                result[word] = idx;
-                idx += 1;
-            }
-
-            return result;
-        }
-
-        private static (NDArray, NDArray) GetDataSet(string path)
-        {
-            var posFiles = Directory.GetFiles(Path.Combine(path, "pos")).Slice(0, 10);
-            var negFiles = Directory.GetFiles(Path.Combine(path, "neg")).Slice(0, 10);
-
-            var x_string = new string[posFiles.Length + negFiles.Length];
-            var y = new int[posFiles.Length + negFiles.Length];
-            var trg = 0;
-            var longest = 0;
-
-            for (int i = 0; i < posFiles.Length; i++, trg++)
-            {
-                y[trg] = 1;
-                x_string[trg] = File.ReadAllText(posFiles[i]);
-                longest = Math.Max(longest, x_string[trg].Length);
-            }
-            for (int i = 0; i < negFiles.Length; i++, trg++)
-            {
-                y[trg] = 0;
-                x_string[trg] = File.ReadAllText(negFiles[i]);
-                longest = Math.Max(longest, x_string[trg].Length);
-            }
-            var x = np.array(x_string);
-
-            return (x, y);
-        }
+            var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
+            var x_train_string = new string[lines.Length];
+            var y_train = np.zeros(new int[] { lines.Length }, NPTypeCode.Int64);
+            for (int i = 0; i < lines.Length; i++)
+            {
+                y_train[i] = long.Parse(lines[i].Substring(0, 1));
+                x_train_string[i] = lines[i].Substring(2);
+            }
+
+            var x_train = np.array(x_train_string);
+
+            lines = File.ReadAllLines(Path.Combine(dst, "imdb_test.txt"));
+            var x_test_string = new string[lines.Length];
+            var y_test = np.zeros(new int[] { lines.Length }, NPTypeCode.Int64);
+            for (int i = 0; i < lines.Length; i++)
+            {
+                y_test[i] = long.Parse(lines[i].Substring(0, 1));
+                x_test_string[i] = lines[i].Substring(2);
+            }
+
+            var x_test = np.array(x_test_string);
+
+            return new DatasetPass
+            {
+                Train = (x_train, y_train),
+                Test = (x_test, y_test)
+            };
+        }
 
         (NDArray, NDArray) LoadX(byte[] bytes)
@@ -122,9 +90,8 @@ namespace Tensorflow.Keras.Datasets
 
             Web.Download(origin_folder + file_name, dst, file_name);
 
-            Tensorflow.Keras.Utils.Compress.ExtractTGZ(Path.Combine(dst, file_name), dst);
-
-            return Path.Combine(dst, "aclImdb");
+            return dst;
             // return Path.Combine(dst, file_name);
         }
     }
 }
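
For reference, here is a minimal sketch of how the resulting Imdb loader might be driven from user code. It relies only on the names visible in this diff (Imdb, load_data, DatasetPass with Train/Test tuples of NDArray); the parameterless load_data() call and the surrounding Program scaffolding are assumptions for illustration, not part of the change itself.

using System;
using NumSharp;
using Tensorflow.Keras.Datasets;

class ImdbExample
{
    static void Main()
    {
        // Assumption: load_data() can be called with defaults; it downloads the
        // dataset into a temporary "imdb" folder on first use (see Download()).
        var imdb = new Imdb();
        var dataset = imdb.load_data();

        // DatasetPass exposes Train/Test as (features, labels) pairs.
        var (x_train, y_train) = dataset.Train;
        var (x_test, y_test) = dataset.Test;

        Console.WriteLine($"train samples: {x_train.shape[0]}, test samples: {x_test.shape[0]}");
        Console.WriteLine($"first train label: {y_train[0]}");
    }
}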