add: loading pickled npy file for imdb dataset loader (tag: v0.110.4-Transformer-Model)
@@ -4,6 +4,8 @@ using System.IO; | |||||
using System.Linq; | using System.Linq; | ||||
using System.Text; | using System.Text; | ||||
using Tensorflow.Util; | using Tensorflow.Util; | ||||
using Razorvine.Pickle; | |||||
using Tensorflow.NumPy.Pickle; | |||||
using static Tensorflow.Binding; | using static Tensorflow.Binding; | ||||
namespace Tensorflow.NumPy | namespace Tensorflow.NumPy | ||||
@@ -97,6 +99,13 @@ namespace Tensorflow.NumPy | |||||
return matrix; | return matrix; | ||||
} | } | ||||
/// <summary>
/// Reads a pickled object ndarray (numpy dtype 'O') from the remainder of the
/// reader's underlying stream using Razorvine's Unpickler.
/// </summary>
/// <remarks>
/// NOTE(review): <paramref name="matrix"/> and <paramref name="shape"/> are not
/// used — the unpickler rebuilds the array entirely from the pickle stream; the
/// parameters exist for signature symmetry with ReadValueMatrix. Assumes the
/// stream is already positioned at the start of the pickle payload — TODO confirm
/// BinaryReader has not read ahead at this point.
/// </remarks>
Array ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape)
{
    var unpickler = new Unpickler();
    // Casting to MultiArrayPickleWarpper selects its implicit conversion to Array.
    return (MultiArrayPickleWarpper)unpickler.load(reader.BaseStream);
}
public (NDArray, NDArray) meshgrid<T>(T[] array, bool copy = true, bool sparse = false) | public (NDArray, NDArray) meshgrid<T>(T[] array, bool copy = true, bool sparse = false) | ||||
{ | { | ||||
var tensors = array_ops.meshgrid(array, copy: copy, sparse: sparse); | var tensors = array_ops.meshgrid(array, copy: copy, sparse: sparse); | ||||
@@ -27,8 +27,14 @@ namespace Tensorflow.NumPy | |||||
Array matrix = Array.CreateInstance(type, shape); | Array matrix = Array.CreateInstance(type, shape); | ||||
//if (type == typeof(String)) | //if (type == typeof(String)) | ||||
//return ReadStringMatrix(reader, matrix, bytes, type, shape); | |||||
return ReadValueMatrix(reader, matrix, bytes, type, shape); | |||||
//return ReadStringMatrix(reader, matrix, bytes, type, shape); | |||||
if (type == typeof(Object)) | |||||
return ReadObjectMatrix(reader, matrix, shape); | |||||
else | |||||
{ | |||||
return ReadValueMatrix(reader, matrix, bytes, type, shape); | |||||
} | |||||
} | } | ||||
} | } | ||||
@@ -37,7 +43,7 @@ namespace Tensorflow.NumPy | |||||
ICloneable, IList, ICollection, IEnumerable, IStructuralComparable, IStructuralEquatable | ICloneable, IList, ICollection, IEnumerable, IStructuralComparable, IStructuralEquatable | ||||
{ | { | ||||
// if (typeof(T).IsArray && (typeof(T).GetElementType().IsArray || typeof(T).GetElementType() == typeof(string))) | // if (typeof(T).IsArray && (typeof(T).GetElementType().IsArray || typeof(T).GetElementType() == typeof(string))) | ||||
// return LoadJagged(stream) as T; | |||||
// return LoadJagged(stream) as T; | |||||
return LoadMatrix(stream) as T; | return LoadMatrix(stream) as T; | ||||
} | } | ||||
@@ -93,7 +99,7 @@ namespace Tensorflow.NumPy | |||||
Type GetType(string dtype, out int bytes, out bool? isLittleEndian) | Type GetType(string dtype, out int bytes, out bool? isLittleEndian) | ||||
{ | { | ||||
isLittleEndian = IsLittleEndian(dtype); | isLittleEndian = IsLittleEndian(dtype); | ||||
bytes = Int32.Parse(dtype.Substring(2)); | |||||
bytes = dtype.Length > 2 ? Int32.Parse(dtype.Substring(2)) : 0; | |||||
string typeCode = dtype.Substring(1); | string typeCode = dtype.Substring(1); | ||||
@@ -121,6 +127,8 @@ namespace Tensorflow.NumPy | |||||
return typeof(Double); | return typeof(Double); | ||||
if (typeCode.StartsWith("S")) | if (typeCode.StartsWith("S")) | ||||
return typeof(String); | return typeof(String); | ||||
if (typeCode.StartsWith("O")) | |||||
return typeof(Object); | |||||
throw new NotSupportedException(); | throw new NotSupportedException(); | ||||
} | } | ||||
@@ -14,9 +14,9 @@ namespace Tensorflow.NumPy | |||||
public NDArray permutation(NDArray x) => new NDArray(random_ops.random_shuffle(x)); | public NDArray permutation(NDArray x) => new NDArray(random_ops.random_shuffle(x)); | ||||
[AutoNumPy] | [AutoNumPy] | ||||
public void shuffle(NDArray x) | |||||
public void shuffle(NDArray x, int? seed = null) | |||||
{ | { | ||||
var y = random_ops.random_shuffle(x); | |||||
var y = random_ops.random_shuffle(x, seed); | |||||
Marshal.Copy(y.BufferToArray(), 0, x.TensorDataPointer, (int)x.bytesize); | Marshal.Copy(y.BufferToArray(), 0, x.TensorDataPointer, (int)x.bytesize); | ||||
} | } | ||||
@@ -10,6 +10,7 @@ namespace Tensorflow.NumPy | |||||
public unsafe static T Scalar<T>(NDArray nd) where T : unmanaged | public unsafe static T Scalar<T>(NDArray nd) where T : unmanaged | ||||
=> nd.dtype switch | => nd.dtype switch | ||||
{ | { | ||||
TF_DataType.TF_BOOL => Scalar<T>(*(bool*)nd.data), | |||||
TF_DataType.TF_UINT8 => Scalar<T>(*(byte*)nd.data), | TF_DataType.TF_UINT8 => Scalar<T>(*(byte*)nd.data), | ||||
TF_DataType.TF_FLOAT => Scalar<T>(*(float*)nd.data), | TF_DataType.TF_FLOAT => Scalar<T>(*(float*)nd.data), | ||||
TF_DataType.TF_INT32 => Scalar<T>(*(int*)nd.data), | TF_DataType.TF_INT32 => Scalar<T>(*(int*)nd.data), | ||||
@@ -0,0 +1,20 @@ | |||||
using System; | |||||
using System.Collections.Generic; | |||||
using System.Text; | |||||
namespace Tensorflow.NumPy.Pickle | |||||
{ | |||||
/// <summary>
/// Placeholder object produced while unpickling a numpy dtype. It carries the
/// TF_DataType resolved by DtypeConstructor and hands it back to consumers via
/// an implicit conversion.
/// </summary>
public class DTypePickleWarpper
{
    // The wrapped dtype is fixed at construction; the original mutable
    // auto-property setter was never used, so keep this immutable.
    private readonly TF_DataType dtype;

    public DTypePickleWarpper(TF_DataType dtype)
    {
        this.dtype = dtype;
    }

    /// <summary>
    /// Called by the unpickler with the dtype's pickled state. The state is
    /// intentionally ignored: the type code handled in DtypeConstructor is the
    /// only piece of information this wrapper needs.
    /// </summary>
    public void __setstate__(object[] args) { }

    public static implicit operator TF_DataType(DTypePickleWarpper dTypeWarpper)
    {
        return dTypeWarpper.dtype;
    }
}
} |
@@ -0,0 +1,52 @@ | |||||
using System; | |||||
using System.Collections.Generic; | |||||
using System.Diagnostics.CodeAnalysis; | |||||
using System.Text; | |||||
using Razorvine.Pickle; | |||||
namespace Tensorflow.NumPy.Pickle | |||||
{ | |||||
/// <summary>
/// Reconstructs numpy.dtype objects during unpickling by mapping the numpy
/// type-code string (args[0], e.g. "i4" for int32 or "f8" for float64) to the
/// corresponding TF_DataType, wrapped in a DTypePickleWarpper.
/// </summary>
[SuppressMessage("ReSharper", "InconsistentNaming")]
[SuppressMessage("ReSharper", "MemberCanBePrivate.Global")]
[SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")]
class DtypeConstructor : IObjectConstructor
{
    public object construct(object[] args)
    {
        // NOTE(review): any additional args (byte order, alignment) are
        // ignored — TODO confirm only native/little-endian data reaches here.
        var typeCode = (string)args[0];
        TF_DataType dtype = typeCode switch
        {
            "b1" => np.@bool,
            "i1" => np.@byte,
            "i2" => np.int16,
            "i4" => np.int32,
            "i8" => np.int64,
            "u1" => np.ubyte,
            "u2" => np.uint16,
            "u4" => np.uint32,
            "u8" => np.uint64,
            "f4" => np.float32,
            "f8" => np.float64,
            // Fixed-width byte strings ("S<n>") and object arrays ("O...").
            _ when typeCode.StartsWith("S") => np.@string,
            _ when typeCode.StartsWith("O") => np.@object,
            // Include the offending code so unpickling failures are diagnosable.
            _ => throw new NotSupportedException($"Unsupported numpy dtype code: {typeCode}"),
        };
        return new DTypePickleWarpper(dtype);
    }
}
} |
@@ -0,0 +1,53 @@ | |||||
using System; | |||||
using System.Collections.Generic; | |||||
using System.Diagnostics.CodeAnalysis; | |||||
using System.Text; | |||||
using Razorvine.Pickle; | |||||
using Razorvine.Pickle.Objects; | |||||
namespace Tensorflow.NumPy.Pickle | |||||
{ | |||||
/// <summary>
/// Handles numpy.core.multiarray._reconstruct during unpickling. Produces a
/// MultiArrayPickleWarpper placeholder; the actual contents arrive later via
/// its __setstate__ call.
/// </summary>
[SuppressMessage("ReSharper", "InconsistentNaming")]
[SuppressMessage("ReSharper", "MemberCanBePrivate.Global")]
[SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")]
public class MultiArrayConstructor : IObjectConstructor
{
    public object construct(object[] args)
    {
        if (args.Length != 3)
            throw new InvalidArgumentError($"Invalid number of arguments in MultiArrayConstructor._reconstruct. Expected three arguments. Given {args.Length} arguments.");

        // args[0]: the class being reconstructed — must be numpy.ndarray itself.
        var types = (ClassDictConstructor)args[0];
        if (types.module != "numpy" || types.name != "ndarray")
            throw new RuntimeError("_reconstruct: First argument must be a sub-type of ndarray");

        // args[1]: placeholder shape tuple.
        var rawDims = (object[])args[1];
        var dims = new int[rawDims.Length];
        for (var i = 0; i < rawDims.Length; i++)
        {
            dims[i] = (int)rawDims[i];
        }
        var shape = new Shape(dims);

        // args[2]: dtype identifier, delivered either as a string or as UTF-8 bytes.
        string identifier;
        if (args[2].GetType() == typeof(string))
            identifier = (string)args[2];
        else
            identifier = Encoding.UTF8.GetString((byte[])args[2]);

        TF_DataType dtype = identifier switch
        {
            "u" => np.uint32,
            "c" => np.complex_,
            "f" => np.float32,
            "b" => np.@bool,
            _ => throw new NotImplementedException($"Unsupported data type: {args[2]}"),
        };

        return new MultiArrayPickleWarpper(shape, dtype);
    }
}
} |
@@ -0,0 +1,119 @@ | |||||
using Newtonsoft.Json.Linq; | |||||
using Serilog.Debugging; | |||||
using System; | |||||
using System.Collections; | |||||
using System.Collections.Generic; | |||||
using System.Text; | |||||
namespace Tensorflow.NumPy.Pickle | |||||
{ | |||||
/// <summary>
/// Placeholder produced by MultiArrayConstructor while unpickling a numpy
/// ndarray. The pickle protocol constructs this wrapper first, then calls
/// <see cref="__setstate__"/> with (version, shape, dtype, is_fortran_order,
/// data); afterwards the reconstructed data is available through the implicit
/// conversions to <see cref="Array"/> and <see cref="NDArray"/>.
/// </summary>
public class MultiArrayPickleWarpper
{
    public Shape reconstructedShape { get; set; }
    public TF_DataType reconstructedDType { get; set; }
    public NDArray reconstructedNDArray { get; set; }
    public Array reconstructedMultiArray { get; set; }

    public MultiArrayPickleWarpper(Shape shape, TF_DataType dtype)
    {
        reconstructedShape = shape;
        reconstructedDType = dtype;
    }

    /// <summary>
    /// Applies the numpy pickle state tuple (version, shape, dtype,
    /// is_fortran_order, data) to this wrapper.
    /// </summary>
    /// <exception cref="InvalidArgumentError">wrong arity or Fortran-ordered data.</exception>
    /// <exception cref="ValueError">unrecognized pickle version.</exception>
    public void __setstate__(object[] args)
    {
        if (args.Length != 5)
            throw new InvalidArgumentError($"Invalid number of arguments in NDArray.__setstate__. Expected five arguments. Given {args.Length} arguments.");

        var version = (int)args[0]; // pickle format version

        // args[1]: shape as reported by the pickle. Parsed for validation only;
        // the final shape is re-derived from the actual data in Reconstruct().
        var arg1 = (object[])args[1];
        var dims = new int[arg1.Length];
        for (var i = 0; i < arg1.Length; i++)
        {
            dims[i] = (int)arg1[i];
        }
        var _ShapeLike = new Shape(dims);

        TF_DataType _DType_co = (DTypePickleWarpper)args[2]; // dtype

        // Only C-contiguous (row-major) layouts are supported.
        var F_continuous = (bool)args[3];
        if (F_continuous)
            throw new InvalidArgumentError("Fortran Continuous memory layout is not supported. Please use C-continuous layout or check the data format.");

        var data = args[4];

        /*
         * If we ever need another pickle format, increment the version
         * number. But we should still be able to handle the old versions.
         */
        if (version < 0 || version > 4)
            throw new ValueError($"can't handle version {version} of numpy.dtype pickle");

        // TODO: Implement the missing details and checks from the official Numpy C code here.
        // https://github.com/numpy/numpy/blob/2f0bd6e86a77e4401d0384d9a75edf9470c5deb6/numpy/core/src/multiarray/descriptor.c#L2761

        if (data.GetType() == typeof(ArrayList))
        {
            Reconstruct((ArrayList)data);
        }
        else
            throw new NotImplementedException($"Unpickling data of type {data.GetType()} is not supported.");
    }

    /// <summary>
    /// Converts a (possibly nested) ArrayList of ints into a rectangular
    /// multi-dimensional array plus an NDArray. Only 1-D and 2-D int data is
    /// supported; ragged 2-D rows are right-padded with default(int) == 0.
    /// </summary>
    private void Reconstruct(ArrayList arrayList)
    {
        // Fail fast with a clear message: an empty list (or a null leading
        // element) previously surfaced as an opaque ArgumentOutOfRangeException
        // or NullReferenceException when probing arrayList[0] below.
        if (arrayList.Count == 0 || arrayList[0] == null)
            throw new InvalidArgumentError("Cannot reconstruct an array from an empty ArrayList or one whose first element is null.");

        // Infer dimensionality by walking the chain of first elements.
        int ndim = 1;
        var subArrayList = arrayList;
        while (subArrayList.Count > 0 && subArrayList[0] != null && subArrayList[0].GetType() == typeof(ArrayList))
        {
            subArrayList = (ArrayList)subArrayList[0];
            ndim += 1;
        }
        if (subArrayList.Count == 0 || subArrayList[0] == null)
            throw new InvalidArgumentError("Cannot reconstruct an array from an empty or null-leading nested ArrayList.");

        var type = subArrayList[0].GetType();
        if (type == typeof(int))
        {
            if (ndim == 1)
            {
                int[] list = (int[])arrayList.ToArray(typeof(int));
                Shape shape = new Shape(new int[] { arrayList.Count });
                reconstructedMultiArray = list;
                reconstructedNDArray = new NDArray(list, shape);
            }
            if (ndim == 2)
            {
                // Width of the rectangular result is the longest row.
                int secondDim = 0;
                foreach (ArrayList subArray in arrayList)
                {
                    secondDim = subArray.Count > secondDim ? subArray.Count : secondDim;
                }
                int[,] list = new int[arrayList.Count, secondDim];
                for (int i = 0; i < arrayList.Count; i++)
                {
                    var subArray = (ArrayList?)arrayList[i];
                    if (subArray == null)
                        throw new NullReferenceException("a row of the ArrayList cannot be null.");
                    for (int j = 0; j < subArray.Count; j++)
                    {
                        var element = subArray[j];
                        if (element == null)
                            throw new NoNullAllowedException("the element of ArrayList cannot be null.");
                        list[i, j] = (int)element;
                    }
                }
                Shape shape = new Shape(new int[] { arrayList.Count, secondDim });
                reconstructedMultiArray = list;
                reconstructedNDArray = new NDArray(list, shape);
            }
            if (ndim > 2)
                throw new NotImplementedException("can't handle ArrayList with more than two dimensions.");
        }
        else
            throw new NotImplementedException($"Cannot reconstruct an array with element type {type}.");
    }

    public static implicit operator Array(MultiArrayPickleWarpper arrayWarpper)
    {
        return arrayWarpper.reconstructedMultiArray;
    }

    public static implicit operator NDArray(MultiArrayPickleWarpper arrayWarpper)
    {
        return arrayWarpper.reconstructedNDArray;
    }
}
} |
@@ -43,7 +43,9 @@ public partial class np | |||||
public static readonly TF_DataType @decimal = TF_DataType.TF_DOUBLE; | public static readonly TF_DataType @decimal = TF_DataType.TF_DOUBLE; | ||||
public static readonly TF_DataType complex_ = TF_DataType.TF_COMPLEX; | public static readonly TF_DataType complex_ = TF_DataType.TF_COMPLEX; | ||||
public static readonly TF_DataType complex64 = TF_DataType.TF_COMPLEX64; | public static readonly TF_DataType complex64 = TF_DataType.TF_COMPLEX64; | ||||
public static readonly TF_DataType complex128 = TF_DataType.TF_COMPLEX128; | |||||
public static readonly TF_DataType complex128 = TF_DataType.TF_COMPLEX128; | |||||
public static readonly TF_DataType @string = TF_DataType.TF_STRING; | |||||
public static readonly TF_DataType @object = TF_DataType.TF_VARIANT; | |||||
#endregion | #endregion | ||||
public static double nan => double.NaN; | public static double nan => double.NaN; | ||||
@@ -176,6 +176,7 @@ https://tensorflownet.readthedocs.io</Description> | |||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" /> | <PackageReference Include="Newtonsoft.Json" Version="13.0.3" /> | ||||
<PackageReference Include="OneOf" Version="3.0.255" /> | <PackageReference Include="OneOf" Version="3.0.255" /> | ||||
<PackageReference Include="Protobuf.Text" Version="0.7.1" /> | <PackageReference Include="Protobuf.Text" Version="0.7.1" /> | ||||
<PackageReference Include="Razorvine.Pickle" Version="1.4.0" /> | |||||
<PackageReference Include="Serilog.Sinks.Console" Version="4.1.0" /> | <PackageReference Include="Serilog.Sinks.Console" Version="4.1.0" /> | ||||
</ItemGroup> | </ItemGroup> | ||||
@@ -14,6 +14,7 @@ | |||||
limitations under the License. | limitations under the License. | ||||
******************************************************************************/ | ******************************************************************************/ | ||||
using Razorvine.Pickle; | |||||
using Serilog; | using Serilog; | ||||
using Serilog.Core; | using Serilog.Core; | ||||
using System.Reflection; | using System.Reflection; | ||||
@@ -22,6 +23,7 @@ using Tensorflow.Contexts; | |||||
using Tensorflow.Eager; | using Tensorflow.Eager; | ||||
using Tensorflow.Gradients; | using Tensorflow.Gradients; | ||||
using Tensorflow.Keras; | using Tensorflow.Keras; | ||||
using Tensorflow.NumPy.Pickle; | |||||
namespace Tensorflow | namespace Tensorflow | ||||
{ | { | ||||
@@ -98,6 +100,10 @@ namespace Tensorflow | |||||
"please visit https://github.com/SciSharp/TensorFlow.NET. If it still not work after installing the backend, please submit an " + | "please visit https://github.com/SciSharp/TensorFlow.NET. If it still not work after installing the backend, please submit an " + | ||||
"issue to https://github.com/SciSharp/TensorFlow.NET/issues"); | "issue to https://github.com/SciSharp/TensorFlow.NET/issues"); | ||||
} | } | ||||
// register numpy reconstructor for pickle | |||||
Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor()); | |||||
Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor()); | |||||
} | } | ||||
public string VERSION => c_api.StringPiece(c_api.TF_Version()); | public string VERSION => c_api.StringPiece(c_api.TF_Version()); | ||||
@@ -3,8 +3,6 @@ using System.Collections.Generic; | |||||
using System.IO; | using System.IO; | ||||
using System.Text; | using System.Text; | ||||
using Tensorflow.Keras.Utils; | using Tensorflow.Keras.Utils; | ||||
using Tensorflow.NumPy; | |||||
using System.Linq; | |||||
namespace Tensorflow.Keras.Datasets | namespace Tensorflow.Keras.Datasets | ||||
{ | { | ||||
@@ -12,11 +10,57 @@ namespace Tensorflow.Keras.Datasets | |||||
/// This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment | /// This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment | ||||
/// (positive/negative). Reviews have been preprocessed, and each review is | /// (positive/negative). Reviews have been preprocessed, and each review is | ||||
/// encoded as a list of word indexes(integers). | /// encoded as a list of word indexes(integers). | ||||
/// For convenience, words are indexed by overall frequency in the dataset, | |||||
/// so that for instance the integer "3" encodes the 3rd most frequent word in | |||||
/// the data.This allows for quick filtering operations such as: | |||||
/// "only consider the top 10,000 most | |||||
/// common words, but eliminate the top 20 most common words". | |||||
/// As a convention, "0" does not stand for a specific word, but instead is used | |||||
/// to encode the pad token. | |||||
/// Args: | |||||
/// path: where to cache the data (relative to %TEMP%/imdb/imdb.npz). | |||||
/// num_words: integer or None.Words are | |||||
/// ranked by how often they occur(in the training set) and only | |||||
/// the `num_words` most frequent words are kept.Any less frequent word | |||||
/// will appear as `oov_char` value in the sequence data.If None, | |||||
/// all words are kept.Defaults to `None`. | |||||
/// skip_top: skip the top N most frequently occurring words | |||||
/// (which may not be informative). These words will appear as | |||||
/// `oov_char` value in the dataset.When 0, no words are | |||||
/// skipped. Defaults to `0`. | |||||
/// maxlen: int or None.Maximum sequence length. | |||||
/// Any longer sequence will be truncated. None, means no truncation. | |||||
/// Defaults to `None`. | |||||
/// seed: int. Seed for reproducible data shuffling. | |||||
/// start_char: int. The start of a sequence will be marked with this | |||||
/// character. 0 is usually the padding character. Defaults to `1`. | |||||
/// oov_char: int. The out-of-vocabulary character. | |||||
/// Words that were cut out because of the `num_words` or | |||||
/// `skip_top` limits will be replaced with this character. | |||||
/// index_from: int. Index actual words with this index and higher. | |||||
/// Returns: | |||||
/// Tuple of Numpy arrays: `(x_train, labels_train), (x_test, labels_test)`. | |||||
/// | |||||
/// ** x_train, x_test**: lists of sequences, which are lists of indexes | |||||
/// (integers). If the num_words argument was specific, the maximum | |||||
/// possible index value is `num_words - 1`. If the `maxlen` argument was | |||||
/// specified, the largest possible sequence length is `maxlen`. | |||||
/// | |||||
/// ** labels_train, labels_test**: lists of integer labels(1 or 0). | |||||
/// | |||||
/// Raises: | |||||
/// ValueError: in case `maxlen` is so low | |||||
/// that no input sequence could be kept. | |||||
/// Note that the 'out of vocabulary' character is only used for | |||||
/// words that were present in the training set but are not included | |||||
/// because they're not making the `num_words` cut here. | |||||
/// Words that were not seen in the training set but are in the test set | |||||
/// have simply been skipped. | |||||
/// </summary> | /// </summary> | ||||
/// """Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). | |||||
public class Imdb | public class Imdb | ||||
{ | { | ||||
string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; | string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"; | ||||
string file_name = "imdb.npz"; | |||||
string dest_folder = "imdb"; | string dest_folder = "imdb"; | ||||
/// <summary> | /// <summary> | ||||
@@ -31,40 +75,150 @@ namespace Tensorflow.Keras.Datasets | |||||
/// <param name="oov_char"></param> | /// <param name="oov_char"></param> | ||||
/// <param name="index_from"></param> | /// <param name="index_from"></param> | ||||
/// <returns></returns> | /// <returns></returns> | ||||
public DatasetPass load_data(string? path = "imdb.npz", | |||||
int num_words = -1, | |||||
public DatasetPass load_data( | |||||
string path = "imdb.npz", | |||||
int? num_words = null, | |||||
int skip_top = 0, | int skip_top = 0, | ||||
int maxlen = -1, | |||||
int? maxlen = null, | |||||
int seed = 113, | int seed = 113, | ||||
int start_char = 1, | |||||
int oov_char= 2, | |||||
int? start_char = 1, | |||||
int? oov_char = 2, | |||||
int index_from = 3) | int index_from = 3) | ||||
{ | { | ||||
if (maxlen == -1) throw new InvalidArgumentError("maxlen must be assigned."); | |||||
var dst = path ?? Download(); | |||||
path = data_utils.get_file( | |||||
path, | |||||
origin: Path.Combine(origin_folder, "imdb.npz"), | |||||
file_hash: "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f" | |||||
); | |||||
path = Path.Combine(path, "imdb.npz"); | |||||
var fileBytes = File.ReadAllBytes(path); | |||||
var (x_train, x_test) = LoadX(fileBytes); | |||||
var (labels_train, labels_test) = LoadY(fileBytes); | |||||
var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt")); | |||||
var x_train_string = new string[lines.Length]; | |||||
var y_train = np.zeros(new int[] { lines.Length }, np.int64); | |||||
for (int i = 0; i < lines.Length; i++) | |||||
var indices = np.arange<int>(len(x_train)); | |||||
np.random.shuffle(indices, seed); | |||||
x_train = x_train[indices]; | |||||
labels_train = labels_train[indices]; | |||||
indices = np.arange<int>(len(x_test)); | |||||
np.random.shuffle(indices, seed); | |||||
x_test = x_test[indices]; | |||||
labels_test = labels_test[indices]; | |||||
var x_train_array = (int[,])x_train.ToMultiDimArray<int>(); | |||||
var x_test_array = (int[,])x_test.ToMultiDimArray<int>(); | |||||
var labels_train_array = (long[])labels_train.ToArray<long>(); | |||||
var labels_test_array = (long[])labels_test.ToArray<long>(); | |||||
if (start_char != null) | |||||
{ | { | ||||
y_train[i] = long.Parse(lines[i].Substring(0, 1)); | |||||
x_train_string[i] = lines[i].Substring(2); | |||||
int[,] new_x_train_array = new int[x_train_array.GetLength(0), x_train_array.GetLength(1) + 1]; | |||||
for (var i = 0; i < x_train_array.GetLength(0); i++) | |||||
{ | |||||
new_x_train_array[i, 0] = (int)start_char; | |||||
for (var j = 0; j < x_train_array.GetLength(1); j++) | |||||
{ | |||||
if (x_train_array[i, j] == 0) | |||||
break; | |||||
new_x_train_array[i, j + 1] = x_train_array[i, j]; | |||||
} | |||||
} | |||||
int[,] new_x_test_array = new int[x_test_array.GetLength(0), x_test_array.GetLength(1) + 1]; | |||||
for (var i = 0; i < x_test_array.GetLength(0); i++) | |||||
{ | |||||
new_x_test_array[i, 0] = (int)start_char; | |||||
for (var j = 0; j < x_test_array.GetLength(1); j++) | |||||
{ | |||||
if (x_test_array[i, j] == 0) | |||||
break; | |||||
new_x_test_array[i, j + 1] = x_test_array[i, j]; | |||||
} | |||||
} | |||||
x_train_array = new_x_train_array; | |||||
x_test_array = new_x_test_array; | |||||
} | |||||
else if (index_from != 0) | |||||
{ | |||||
for (var i = 0; i < x_train_array.GetLength(0); i++) | |||||
{ | |||||
for (var j = 0; j < x_train_array.GetLength(1); j++) | |||||
{ | |||||
if (x_train_array[i, j] == 0) | |||||
break; | |||||
x_train_array[i, j] += index_from; | |||||
} | |||||
} | |||||
for (var i = 0; i < x_test_array.GetLength(0); i++) | |||||
{ | |||||
for (var j = 0; j < x_test_array.GetLength(1); j++) | |||||
{ | |||||
if (x_test_array[i, j] == 0) | |||||
break; | |||||
x_test[i, j] += index_from; | |||||
} | |||||
} | |||||
} | } | ||||
var x_train = keras.preprocessing.sequence.pad_sequences(PraseData(x_train_string), maxlen: maxlen); | |||||
if (maxlen == null) | |||||
{ | |||||
maxlen = max(x_train_array.GetLength(1), x_test_array.GetLength(1)); | |||||
} | |||||
(x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train_array, labels_train_array); | |||||
(x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test_array, labels_test_array); | |||||
if (x_train.size == 0 || x_test.size == 0) | |||||
throw new ValueError("After filtering for sequences shorter than maxlen=" + | |||||
$"{maxlen}, no sequence was kept. Increase maxlen."); | |||||
lines = File.ReadAllLines(Path.Combine(dst, "imdb_test.txt")); | |||||
var x_test_string = new string[lines.Length]; | |||||
var y_test = np.zeros(new int[] { lines.Length }, np.int64); | |||||
for (int i = 0; i < lines.Length; i++) | |||||
var xs = np.concatenate(new[] { x_train, x_test }); | |||||
var labels = np.concatenate(new[] { labels_train, labels_test }); | |||||
var xs_array = (int[,])xs.ToMultiDimArray<int>(); | |||||
if (num_words == null) | |||||
{ | { | ||||
y_test[i] = long.Parse(lines[i].Substring(0, 1)); | |||||
x_test_string[i] = lines[i].Substring(2); | |||||
num_words = 0; | |||||
for (var i = 0; i < xs_array.GetLength(0); i++) | |||||
for (var j = 0; j < xs_array.GetLength(1); j++) | |||||
num_words = max((int)num_words, (int)xs_array[i, j]); | |||||
} | } | ||||
var x_test = keras.preprocessing.sequence.pad_sequences(PraseData(x_test_string), maxlen: maxlen); | |||||
// by convention, use 2 as OOV word | |||||
// reserve 'index_from' (=3 by default) characters: | |||||
// 0 (padding), 1 (start), 2 (OOV) | |||||
if (oov_char != null) | |||||
{ | |||||
int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; | |||||
for (var i = 0; i < xs_array.GetLength(0); i++) | |||||
{ | |||||
for (var j = 0; j < xs_array.GetLength(1); j++) | |||||
{ | |||||
if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) | |||||
new_xs_array[i, j] = xs_array[i, j]; | |||||
else | |||||
new_xs_array[i, j] = (int)oov_char; | |||||
} | |||||
} | |||||
xs = new NDArray(new_xs_array); | |||||
} | |||||
else | |||||
{ | |||||
int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; | |||||
for (var i = 0; i < xs_array.GetLength(0); i++) | |||||
{ | |||||
int k = 0; | |||||
for (var j = 0; j < xs_array.GetLength(1); j++) | |||||
{ | |||||
if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) | |||||
new_xs_array[i, k++] = xs_array[i, j]; | |||||
} | |||||
} | |||||
xs = new NDArray(new_xs_array); | |||||
} | |||||
var idx = len(x_train); | |||||
x_train = xs[$"0:{idx}"]; | |||||
x_test = xs[$"{idx}:"]; | |||||
var y_train = labels[$"0:{idx}"]; | |||||
var y_test = labels[$"{idx}:"]; | |||||
return new DatasetPass | return new DatasetPass | ||||
{ | { | ||||
@@ -75,8 +229,8 @@ namespace Tensorflow.Keras.Datasets | |||||
(NDArray, NDArray) LoadX(byte[] bytes) | (NDArray, NDArray) LoadX(byte[] bytes) | ||||
{ | { | ||||
var y = np.Load_Npz<byte[]>(bytes); | |||||
return (y["x_train.npy"], y["x_test.npy"]); | |||||
var x = np.Load_Npz<int[,]>(bytes); | |||||
return (x["x_train.npy"], x["x_test.npy"]); | |||||
} | } | ||||
(NDArray, NDArray) LoadY(byte[] bytes) | (NDArray, NDArray) LoadY(byte[] bytes) | ||||
@@ -84,34 +238,5 @@ namespace Tensorflow.Keras.Datasets | |||||
var y = np.Load_Npz<long[]>(bytes); | var y = np.Load_Npz<long[]>(bytes); | ||||
return (y["y_train.npy"], y["y_test.npy"]); | return (y["y_train.npy"], y["y_test.npy"]); | ||||
} | } | ||||
string Download() | |||||
{ | |||||
var dst = Path.Combine(Path.GetTempPath(), dest_folder); | |||||
Directory.CreateDirectory(dst); | |||||
Web.Download(origin_folder + file_name, dst, file_name); | |||||
return dst; | |||||
// return Path.Combine(dst, file_name); | |||||
} | |||||
protected IEnumerable<int[]> PraseData(string[] x) | |||||
{ | |||||
var data_list = new List<int[]>(); | |||||
for (int i = 0; i < len(x); i++) | |||||
{ | |||||
var list_string = x[i]; | |||||
var cleaned_list_string = list_string.Replace("[", "").Replace("]", "").Replace(" ", ""); | |||||
string[] number_strings = cleaned_list_string.Split(','); | |||||
int[] numbers = new int[number_strings.Length]; | |||||
for (int j = 0; j < number_strings.Length; j++) | |||||
{ | |||||
numbers[j] = int.Parse(number_strings[j]); | |||||
} | |||||
data_list.Add(numbers); | |||||
} | |||||
return data_list; | |||||
} | |||||
} | } | ||||
} | } |
@@ -39,5 +39,54 @@ namespace Tensorflow.Keras.Utils | |||||
return datadir; | return datadir; | ||||
} | } | ||||
/// <summary>
/// Removes sequences that exceed the maximum length, keeping labels aligned.
/// Surviving rows are truncated/padded to exactly <paramref name="maxlen"/> columns.
/// </summary>
/// <param name="maxlen">Maximum length of the output sequences.</param>
/// <param name="seq">2-D int array of sequences, one per row.</param>
/// <param name="label">1-D long array of labels, one per sequence.</param>
/// <returns>Shortened (sequences, labels) pair.</returns>
public static (NDArray, NDArray) _remove_long_seq(int maxlen, NDArray seq, NDArray label)
{
    var sequences = (int[,])seq.ToMultiDimArray<int>();
    var labels = (long[])label.ToArray<long>();
    int cols = sequences.GetLength(1);

    var keptRows = new List<int[]>();
    var keptLabels = new List<long>();

    for (var row = 0; row < sequences.GetLength(0); row++)
    {
        // A non-zero entry at column `maxlen` means the sequence extends past
        // maxlen, so the whole row is dropped.
        // NOTE(review): assumes 0 marks end-of-sequence padding — TODO confirm.
        if (maxlen < cols && sequences[row, maxlen] != 0)
            continue;

        var sentence = new int[maxlen];
        for (var col = 0; col < maxlen && col < cols; col++)
        {
            sentence[col] = sequences[row, col];
        }
        keptRows.Add(sentence);
        keptLabels.Add(labels[row]);
    }

    // Pack the kept rows back into a rectangular array for NDArray conversion.
    var newSeq = new int[keptRows.Count, maxlen];
    for (var row = 0; row < keptRows.Count; row++)
    {
        for (var col = 0; col < maxlen; col++)
        {
            newSeq[row, col] = keptRows[row][col];
        }
    }

    return (newSeq, keptLabels.ToArray());
}
} | } | ||||
} | } |
@@ -1,7 +1,10 @@ | |||||
using Microsoft.VisualStudio.TestTools.UnitTesting; | using Microsoft.VisualStudio.TestTools.UnitTesting; | ||||
using System; | using System; | ||||
using System.Collections.Generic; | |||||
using System.Linq; | using System.Linq; | ||||
using Tensorflow.NumPy; | |||||
using static Tensorflow.Binding; | using static Tensorflow.Binding; | ||||
using static Tensorflow.KerasApi; | |||||
namespace TensorFlowNET.UnitTest.Dataset | namespace TensorFlowNET.UnitTest.Dataset | ||||
{ | { | ||||
@@ -195,5 +198,40 @@ namespace TensorFlowNET.UnitTest.Dataset | |||||
Assert.IsFalse(allEqual); | Assert.IsFalse(allEqual); | ||||
} | } | ||||
[Ignore]
[TestMethod]
public void GetData()
{
    // Only consider the top 20k words, and the first 200 words of each review.
    const int vocabSize = 20000;
    const int maxLen = 200;

    var dataset = keras.datasets.imdb.load_data(num_words: vocabSize, maxlen: maxLen);
    var xTrain = dataset.Train.Item1;
    var yTrain = dataset.Train.Item2;
    var xVal = dataset.Test.Item1;
    var yVal = dataset.Test.Item2;

    // Strip zero padding, then re-pad every sequence to a uniform maxLen.
    xTrain = keras.preprocessing.sequence.pad_sequences(RemoveZeros(xTrain), maxlen: maxLen);
    xVal = keras.preprocessing.sequence.pad_sequences(RemoveZeros(xVal), maxlen: maxLen);

    print(len(xTrain) + " Training sequences");
    print(len(xVal) + " Validation sequences");
}
/// <summary>
/// Splits a 2-D NDArray into one int array per row, truncating each row at its
/// first zero (zero is treated as end-of-sequence padding).
/// </summary>
IEnumerable<int[]> RemoveZeros(NDArray data)
{
    var matrix = (int[,])data.ToMultiDimArray<int>();
    var rows = new List<int[]>();
    for (var i = 0; i < matrix.GetLength(0); i++)
    {
        var row = new List<int>();
        for (var j = 0; j < matrix.GetLength(1); j++)
        {
            if (matrix[i, j] == 0)
                break;
            row.Add(matrix[i, j]);
        }
        rows.Add(row.ToArray());
    }
    return rows;
}
} | } | ||||
} | } |