using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Tensorflow.Keras.Utils;
using Tensorflow.NumPy;
using System.Linq;
namespace Tensorflow.Keras.Datasets
{
/// <summary>
/// This is a dataset of 25,000 movie reviews from IMDB, labeled by sentiment
/// (positive/negative). Reviews have been preprocessed, and each review is
/// encoded as a list of word indexes (integers).
/// </summary>
public class Imdb
{
    // Base URL that the archive name is appended to when downloading.
    private readonly string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";

    // Name of the archive fetched by Download().
    private readonly string file_name = "imdb.npz";

    // Sub-folder of the system temp directory that holds the dataset files.
    private readonly string dest_folder = "imdb";

    /// <summary>
    /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
    /// Reads "imdb_train.txt" and "imdb_test.txt" from the local dataset folder,
    /// where every line is a one-character label, a separator character, and a
    /// bracketed comma-separated list of integers.
    /// </summary>
    /// <param name="path">Not used by the current implementation.</param>
    /// <param name="num_words">Not used by the current implementation.</param>
    /// <param name="skip_top">Not used by the current implementation.</param>
    /// <param name="maxlen">
    /// Forwarded as <c>maxlen</c> to <c>keras.preprocessing.sequence.pad_sequences</c>.
    /// Must be supplied; the default of -1 is rejected.
    /// </param>
    /// <param name="seed">Not used by the current implementation.</param>
    /// <param name="start_char">Not used by the current implementation.</param>
    /// <param name="oov_char">Not used by the current implementation.</param>
    /// <param name="index_from">Not used by the current implementation.</param>
    /// <returns>
    /// A <see cref="DatasetPass"/> whose <c>Train</c> and <c>Test</c> members are
    /// (padded sequences, int64 labels) pairs built from the respective file.
    /// </returns>
    /// <exception cref="InvalidArgumentError">Thrown when <paramref name="maxlen"/> is -1.</exception>
    public DatasetPass load_data(string path = "imdb.npz",
        int num_words = -1,
        int skip_top = 0,
        int maxlen = -1,
        int seed = 113,
        int start_char = 1,
        int oov_char = 2,
        int index_from = 3)
    {
        if (maxlen == -1) throw new InvalidArgumentError("maxlen must be assigned.");

        var dst = Download();

        // NOTE(review): Download() only fetches "imdb.npz"; nothing visible in this
        // class produces the two text files read below - confirm how they are
        // expected to reach the dataset folder.
        // Each split is read from its own file. Previously the test split was
        // built from the training lines because the second read was discarded.
        var (x_train, y_train) = LoadSplit(Path.Combine(dst, "imdb_train.txt"), maxlen);
        var (x_test, y_test) = LoadSplit(Path.Combine(dst, "imdb_test.txt"), maxlen);

        return new DatasetPass
        {
            Train = (x_train, y_train),
            Test = (x_test, y_test)
        };
    }

    /// <summary>
    /// Reads one split file and converts it to (padded sequences, labels).
    /// </summary>
    /// <param name="file">Full path of the text file to read.</param>
    /// <param name="maxlen">Forwarded as <c>maxlen</c> to <c>pad_sequences</c>.</param>
    /// <returns>The padded sequences and an int64 label array with one entry per data line.</returns>
    (NDArray, NDArray) LoadSplit(string file, int maxlen)
    {
        // Blank lines carry no sample and would make Substring throw, so drop them.
        var lines = File.ReadAllLines(file)
            .Where(line => !string.IsNullOrWhiteSpace(line))
            .ToArray();

        var x_string = new string[lines.Length];
        var y = np.zeros(new int[] { lines.Length }, np.int64);
        for (int i = 0; i < lines.Length; i++)
        {
            // Character 0 is the label; the sequence text starts at index 2.
            y[i] = long.Parse(lines[i].Substring(0, 1), System.Globalization.CultureInfo.InvariantCulture);
            x_string[i] = lines[i].Substring(2);
        }

        var x = keras.preprocessing.sequence.pad_sequences(PraseData(x_string), maxlen: maxlen);
        return (x, y);
    }

    /// <summary>
    /// Extracts the "x_train.npy" and "x_test.npy" entries from an .npz archive.
    /// Not called by any code in this class.
    /// </summary>
    /// <param name="bytes">Raw bytes of the .npz archive.</param>
    (NDArray, NDArray) LoadX(byte[] bytes)
    {
        // NOTE(review): np.Load_Npz may require an explicit generic type argument - confirm against its declaration.
        var y = np.Load_Npz(bytes);
        return (y["x_train.npy"], y["x_test.npy"]);
    }

    /// <summary>
    /// Extracts the "y_train.npy" and "y_test.npy" entries from an .npz archive.
    /// Not called by any code in this class.
    /// </summary>
    /// <param name="bytes">Raw bytes of the .npz archive.</param>
    (NDArray, NDArray) LoadY(byte[] bytes)
    {
        // NOTE(review): np.Load_Npz may require an explicit generic type argument - confirm against its declaration.
        var y = np.Load_Npz(bytes);
        return (y["y_train.npy"], y["y_test.npy"]);
    }

    /// <summary>
    /// Ensures the dataset folder exists under the system temp directory and
    /// asks <c>Web.Download</c> to fetch the archive into it.
    /// </summary>
    /// <returns>The dataset folder path (not the path of the archive itself).</returns>
    string Download()
    {
        var dst = Path.Combine(Path.GetTempPath(), dest_folder);
        Directory.CreateDirectory(dst);

        Web.Download(origin_folder + file_name, dst, file_name);

        return dst;
    }

    /// <summary>
    /// Parses textual integer lists such as "[1, 14, 22]" into integer arrays.
    /// </summary>
    /// <param name="x">One bracketed, comma-separated list of integers per element.</param>
    /// <returns>One <c>int[]</c> per input string, in input order; "[]" yields an empty array.</returns>
    /// <exception cref="FormatException">Thrown when a token is not a valid integer.</exception>
    protected IEnumerable<int[]> PraseData(string[] x)
    {
        var data_list = new List<int[]>(x.Length);
        foreach (var list_string in x)
        {
            var cleaned_list_string = list_string.Replace("[", "").Replace("]", "").Replace(" ", "");

            // RemoveEmptyEntries lets an empty list ("[]") produce an empty array
            // instead of failing on int.Parse("").
            string[] number_strings = cleaned_list_string.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

            int[] numbers = new int[number_strings.Length];
            for (int j = 0; j < number_strings.Length; j++)
            {
                numbers[j] = int.Parse(number_strings[j], System.Globalization.CultureInfo.InvariantCulture);
            }
            data_list.Add(numbers);
        }
        return data_list;
    }
}
}