using System; using System.Collections.Generic; using System.IO; using System.Text; using Tensorflow.Keras.Utils; namespace Tensorflow.Keras.Datasets { /// /// This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment /// (positive/negative). Reviews have been preprocessed, and each review is /// encoded as a list of word indexes(integers). /// For convenience, words are indexed by overall frequency in the dataset, /// so that for instance the integer "3" encodes the 3rd most frequent word in /// the data.This allows for quick filtering operations such as: /// "only consider the top 10,000 most /// common words, but eliminate the top 20 most common words". /// As a convention, "0" does not stand for a specific word, but instead is used /// to encode the pad token. /// Args: /// path: where to cache the data (relative to %TEMP%/imdb/imdb.npz). /// num_words: integer or None.Words are /// ranked by how often they occur(in the training set) and only /// the `num_words` most frequent words are kept.Any less frequent word /// will appear as `oov_char` value in the sequence data.If None, /// all words are kept.Defaults to `None`. /// skip_top: skip the top N most frequently occurring words /// (which may not be informative). These words will appear as /// `oov_char` value in the dataset.When 0, no words are /// skipped. Defaults to `0`. /// maxlen: int or None.Maximum sequence length. /// Any longer sequence will be truncated. None, means no truncation. /// Defaults to `None`. /// seed: int. Seed for reproducible data shuffling. /// start_char: int. The start of a sequence will be marked with this /// character. 0 is usually the padding character. Defaults to `1`. /// oov_char: int. The out-of-vocabulary character. /// Words that were cut out because of the `num_words` or /// `skip_top` limits will be replaced with this character. /// index_from: int. Index actual words with this index and higher. /// Returns: /// Tuple of Numpy arrays: `(x_train, labels_train), (x_test, labels_test)`. /// /// ** x_train, x_test**: lists of sequences, which are lists of indexes /// (integers). If the num_words argument was specific, the maximum /// possible index value is `num_words - 1`. If the `maxlen` argument was /// specified, the largest possible sequence length is `maxlen`. /// /// ** labels_train, labels_test**: lists of integer labels(1 or 0). /// /// Raises: /// ValueError: in case `maxlen` is so low /// that no input sequence could be kept. /// Note that the 'out of vocabulary' character is only used for /// words that were present in the training set but are not included /// because they're not making the `num_words` cut here. /// Words that were not seen in the training set but are in the test set /// have simply been skipped. /// /// """Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). public class Imdb { string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; string dest_folder = "imdb"; /// /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). /// /// /// /// /// /// /// /// /// /// public DatasetPass load_data( string path = "imdb.npz", int? num_words = null, int skip_top = 0, int? maxlen = null, int seed = 113, int? start_char = 1, int? oov_char = 2, int index_from = 3) { path = data_utils.get_file( path, origin: Path.Combine(origin_folder, "imdb.npz"), file_hash: "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f" ); path = Path.Combine(path, "imdb.npz"); var fileBytes = File.ReadAllBytes(path); var (x_train, x_test) = LoadX(fileBytes); var (labels_train, labels_test) = LoadY(fileBytes); var indices = np.arange(len(x_train)); np.random.shuffle(indices, seed); x_train = x_train[indices]; labels_train = labels_train[indices]; indices = np.arange(len(x_test)); np.random.shuffle(indices, seed); x_test = x_test[indices]; labels_test = labels_test[indices]; var x_train_array = (int[,])x_train.ToMultiDimArray(); var x_test_array = (int[,])x_test.ToMultiDimArray(); var labels_train_array = (long[])labels_train.ToArray(); var labels_test_array = (long[])labels_test.ToArray(); if (start_char != null) { int[,] new_x_train_array = new int[x_train_array.GetLength(0), x_train_array.GetLength(1) + 1]; for (var i = 0; i < x_train_array.GetLength(0); i++) { new_x_train_array[i, 0] = (int)start_char; Array.Copy(x_train_array, i * x_train_array.GetLength(1), new_x_train_array, i * new_x_train_array.GetLength(1) + 1, x_train_array.GetLength(1)); } int[,] new_x_test_array = new int[x_test_array.GetLength(0), x_test_array.GetLength(1) + 1]; for (var i = 0; i < x_test_array.GetLength(0); i++) { new_x_test_array[i, 0] = (int)start_char; Array.Copy(x_test_array, i * x_test_array.GetLength(1), new_x_test_array, i * new_x_test_array.GetLength(1) + 1, x_test_array.GetLength(1)); } x_train_array = new_x_train_array; x_test_array = new_x_test_array; } else if (index_from != 0) { for (var i = 0; i < x_train_array.GetLength(0); i++) { for (var j = 0; j < x_train_array.GetLength(1); j++) { if (x_train_array[i, j] == 0) break; x_train_array[i, j] += index_from; } } for (var i = 0; i < x_test_array.GetLength(0); i++) { for (var j = 0; j < x_test_array.GetLength(1); j++) { if (x_test_array[i, j] == 0) break; x_test[i, j] += index_from; } } } if (maxlen == null) { maxlen = max(x_train_array.GetLength(1), x_test_array.GetLength(1)); } (x_train_array, labels_train_array) = data_utils._remove_long_seq((int)maxlen, x_train_array, labels_train_array); (x_test_array, labels_test_array) = data_utils._remove_long_seq((int)maxlen, x_test_array, labels_test_array); if (x_train_array.Length == 0 || x_test_array.Length == 0) throw new ValueError("After filtering for sequences shorter than maxlen=" + $"{maxlen}, no sequence was kept. Increase maxlen."); int[,] xs_array = new int[x_train_array.GetLength(0) + x_test_array.GetLength(0), (int)maxlen]; Array.Copy(x_train_array, xs_array, x_train_array.Length); Array.Copy(x_test_array, 0, xs_array, x_train_array.Length, x_train_array.Length); long[] labels_array = new long[labels_train_array.Length + labels_test_array.Length]; Array.Copy(labels_train_array, labels_array, labels_train_array.Length); Array.Copy(labels_test_array, 0, labels_array, labels_train_array.Length, labels_test_array.Length); if (num_words == null) { num_words = 0; for (var i = 0; i < xs_array.GetLength(0); i++) for (var j = 0; j < xs_array.GetLength(1); j++) num_words = max((int)num_words, (int)xs_array[i, j]); } // by convention, use 2 as OOV word // reserve 'index_from' (=3 by default) characters: // 0 (padding), 1 (start), 2 (OOV) if (oov_char != null) { int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; for (var i = 0; i < xs_array.GetLength(0); i++) { for (var j = 0; j < xs_array.GetLength(1); j++) { if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) new_xs_array[i, j] = xs_array[i, j]; else new_xs_array[i, j] = (int)oov_char; } } xs_array = new_xs_array; } else { int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; for (var i = 0; i < xs_array.GetLength(0); i++) { int k = 0; for (var j = 0; j < xs_array.GetLength(1); j++) { if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) new_xs_array[i, k++] = xs_array[i, j]; } } xs_array = new_xs_array; } Array.Copy(xs_array, x_train_array, x_train_array.Length); Array.Copy(xs_array, x_train_array.Length, x_test_array, 0, x_train_array.Length); Array.Copy(labels_array, labels_train_array, labels_train_array.Length); Array.Copy(labels_array, labels_train_array.Length, labels_test_array, 0, labels_test_array.Length); return new DatasetPass { Train = (x_train_array, labels_train_array), Test = (x_test_array, labels_test_array) }; } (NDArray, NDArray) LoadX(byte[] bytes) { var x = np.Load_Npz(bytes); return (x["x_train.npy"], x["x_test.npy"]); } (NDArray, NDArray) LoadY(byte[] bytes) { var y = np.Load_Npz(bytes); return (y["y_train.npy"], y["y_test.npy"]); } } }