|
- using System;
- using System.Collections.Generic;
- using System.IO;
- using System.Text;
- using Tensorflow.Keras.Utils;
-
- namespace Tensorflow.Keras.Datasets
- {
- /// <summary>
- /// This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment
- /// (positive/negative). Reviews have been preprocessed, and each review is
- /// encoded as a list of word indexes(integers).
- /// For convenience, words are indexed by overall frequency in the dataset,
- /// so that for instance the integer "3" encodes the 3rd most frequent word in
- /// the data.This allows for quick filtering operations such as:
- /// "only consider the top 10,000 most
- /// common words, but eliminate the top 20 most common words".
- /// As a convention, "0" does not stand for a specific word, but instead is used
- /// to encode the pad token.
- /// Args:
- /// path: where to cache the data (relative to %TEMP%/imdb/imdb.npz).
- /// num_words: integer or None.Words are
- /// ranked by how often they occur(in the training set) and only
- /// the `num_words` most frequent words are kept.Any less frequent word
- /// will appear as `oov_char` value in the sequence data.If None,
- /// all words are kept.Defaults to `None`.
- /// skip_top: skip the top N most frequently occurring words
- /// (which may not be informative). These words will appear as
- /// `oov_char` value in the dataset.When 0, no words are
- /// skipped. Defaults to `0`.
- /// maxlen: int or None.Maximum sequence length.
- /// Any longer sequence will be truncated. None, means no truncation.
- /// Defaults to `None`.
- /// seed: int. Seed for reproducible data shuffling.
- /// start_char: int. The start of a sequence will be marked with this
- /// character. 0 is usually the padding character. Defaults to `1`.
- /// oov_char: int. The out-of-vocabulary character.
- /// Words that were cut out because of the `num_words` or
- /// `skip_top` limits will be replaced with this character.
- /// index_from: int. Index actual words with this index and higher.
- /// Returns:
- /// Tuple of Numpy arrays: `(x_train, labels_train), (x_test, labels_test)`.
- ///
- /// ** x_train, x_test**: lists of sequences, which are lists of indexes
- /// (integers). If the num_words argument was specific, the maximum
- /// possible index value is `num_words - 1`. If the `maxlen` argument was
- /// specified, the largest possible sequence length is `maxlen`.
- ///
- /// ** labels_train, labels_test**: lists of integer labels(1 or 0).
- ///
- /// Raises:
- /// ValueError: in case `maxlen` is so low
- /// that no input sequence could be kept.
- /// Note that the 'out of vocabulary' character is only used for
- /// words that were present in the training set but are not included
- /// because they're not making the `num_words` cut here.
- /// Words that were not seen in the training set but are in the test set
- /// have simply been skipped.
- /// </summary>
- /// """Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
- public class Imdb
- {
- string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
- string dest_folder = "imdb";
-
- /// <summary>
- /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
- /// </summary>
- /// <param name="path"></param>
- /// <param name="num_words"></param>
- /// <param name="skip_top"></param>
- /// <param name="maxlen"></param>
- /// <param name="seed"></param>
- /// <param name="start_char"></param>
- /// <param name="oov_char"></param>
- /// <param name="index_from"></param>
- /// <returns></returns>
- public DatasetPass load_data(
- string path = "imdb.npz",
- int? num_words = null,
- int skip_top = 0,
- int? maxlen = null,
- int seed = 113,
- int? start_char = 1,
- int? oov_char = 2,
- int index_from = 3)
- {
- path = data_utils.get_file(
- path,
- origin: Path.Combine(origin_folder, "imdb.npz"),
- file_hash: "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f"
- );
- path = Path.Combine(path, "imdb.npz");
- var fileBytes = File.ReadAllBytes(path);
- var (x_train, x_test) = LoadX(fileBytes);
- var (labels_train, labels_test) = LoadY(fileBytes);
-
- var indices = np.arange<int>(len(x_train));
- np.random.shuffle(indices, seed);
- x_train = x_train[indices];
- labels_train = labels_train[indices];
-
- indices = np.arange<int>(len(x_test));
- np.random.shuffle(indices, seed);
- x_test = x_test[indices];
- labels_test = labels_test[indices];
-
- var x_train_array = (int[,])x_train.ToMultiDimArray<int>();
- var x_test_array = (int[,])x_test.ToMultiDimArray<int>();
- var labels_train_array = (long[])labels_train.ToArray<long>();
- var labels_test_array = (long[])labels_test.ToArray<long>();
-
- if (start_char != null)
- {
- int[,] new_x_train_array = new int[x_train_array.GetLength(0), x_train_array.GetLength(1) + 1];
- for (var i = 0; i < x_train_array.GetLength(0); i++)
- {
- new_x_train_array[i, 0] = (int)start_char;
- Array.Copy(x_train_array, i * x_train_array.GetLength(1), new_x_train_array, i * new_x_train_array.GetLength(1) + 1, x_train_array.GetLength(1));
- }
- int[,] new_x_test_array = new int[x_test_array.GetLength(0), x_test_array.GetLength(1) + 1];
- for (var i = 0; i < x_test_array.GetLength(0); i++)
- {
- new_x_test_array[i, 0] = (int)start_char;
- Array.Copy(x_test_array, i * x_test_array.GetLength(1), new_x_test_array, i * new_x_test_array.GetLength(1) + 1, x_test_array.GetLength(1));
- }
- x_train_array = new_x_train_array;
- x_test_array = new_x_test_array;
- }
- else if (index_from != 0)
- {
- for (var i = 0; i < x_train_array.GetLength(0); i++)
- {
- for (var j = 0; j < x_train_array.GetLength(1); j++)
- {
- if (x_train_array[i, j] == 0)
- break;
- x_train_array[i, j] += index_from;
- }
- }
- for (var i = 0; i < x_test_array.GetLength(0); i++)
- {
- for (var j = 0; j < x_test_array.GetLength(1); j++)
- {
- if (x_test_array[i, j] == 0)
- break;
- x_test[i, j] += index_from;
- }
- }
- }
-
- if (maxlen == null)
- {
- maxlen = max(x_train_array.GetLength(1), x_test_array.GetLength(1));
- }
- (x_train_array, labels_train_array) = data_utils._remove_long_seq((int)maxlen, x_train_array, labels_train_array);
- (x_test_array, labels_test_array) = data_utils._remove_long_seq((int)maxlen, x_test_array, labels_test_array);
- if (x_train_array.Length == 0 || x_test_array.Length == 0)
- throw new ValueError("After filtering for sequences shorter than maxlen=" +
- $"{maxlen}, no sequence was kept. Increase maxlen.");
-
- int[,] xs_array = new int[x_train_array.GetLength(0) + x_test_array.GetLength(0), (int)maxlen];
- Array.Copy(x_train_array, xs_array, x_train_array.Length);
- Array.Copy(x_test_array, 0, xs_array, x_train_array.Length, x_train_array.Length);
-
- long[] labels_array = new long[labels_train_array.Length + labels_test_array.Length];
- Array.Copy(labels_train_array, labels_array, labels_train_array.Length);
- Array.Copy(labels_test_array, 0, labels_array, labels_train_array.Length, labels_test_array.Length);
-
- if (num_words == null)
- {
- num_words = 0;
- for (var i = 0; i < xs_array.GetLength(0); i++)
- for (var j = 0; j < xs_array.GetLength(1); j++)
- num_words = max((int)num_words, (int)xs_array[i, j]);
- }
-
- // by convention, use 2 as OOV word
- // reserve 'index_from' (=3 by default) characters:
- // 0 (padding), 1 (start), 2 (OOV)
- if (oov_char != null)
- {
- int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)];
- for (var i = 0; i < xs_array.GetLength(0); i++)
- {
- for (var j = 0; j < xs_array.GetLength(1); j++)
- {
- if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words)
- new_xs_array[i, j] = xs_array[i, j];
- else
- new_xs_array[i, j] = (int)oov_char;
- }
- }
- xs_array = new_xs_array;
- }
- else
- {
- int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)];
- for (var i = 0; i < xs_array.GetLength(0); i++)
- {
- int k = 0;
- for (var j = 0; j < xs_array.GetLength(1); j++)
- {
- if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words)
- new_xs_array[i, k++] = xs_array[i, j];
- }
- }
- xs_array = new_xs_array;
- }
-
- Array.Copy(xs_array, x_train_array, x_train_array.Length);
- Array.Copy(xs_array, x_train_array.Length, x_test_array, 0, x_train_array.Length);
-
- Array.Copy(labels_array, labels_train_array, labels_train_array.Length);
- Array.Copy(labels_array, labels_train_array.Length, labels_test_array, 0, labels_test_array.Length);
-
- return new DatasetPass
- {
- Train = (x_train_array, labels_train_array),
- Test = (x_test_array, labels_test_array)
- };
- }
-
- (NDArray, NDArray) LoadX(byte[] bytes)
- {
- var x = np.Load_Npz<int[,]>(bytes);
- return (x["x_train.npy"], x["x_test.npy"]);
- }
-
- (NDArray, NDArray) LoadY(byte[] bytes)
- {
- var y = np.Load_Npz<long[]>(bytes);
- return (y["y_train.npy"], y["y_test.npy"]);
- }
- }
- }
|