
Merge pull request #1169 from lingbai-kong/ndarrayload

add: loading pickled npy file for imdb dataset loader
tags/v0.110.4-Transformer-Model
Haiping (GitHub) committed 2 years ago
parent commit 179c3f0216
14 changed files with 546 additions and 63 deletions
  1. src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs (+9 -0)
  2. src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs (+12 -4)
  3. src/TensorFlowNET.Core/NumPy/Implementation/RandomizedImpl.cs (+2 -2)
  4. src/TensorFlowNET.Core/NumPy/NDArrayConverter.cs (+1 -0)
  5. src/TensorFlowNET.Core/NumPy/Pickle/DTypePickleWarpper.cs (+20 -0)
  6. src/TensorFlowNET.Core/NumPy/Pickle/DtypeConstructor.cs (+52 -0)
  7. src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayConstructor.cs (+53 -0)
  8. src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayPickleWarpper.cs (+119 -0)
  9. src/TensorFlowNET.Core/Numpy/Numpy.cs (+3 -1)
  10. src/TensorFlowNET.Core/Tensorflow.Binding.csproj (+1 -0)
  11. src/TensorFlowNET.Core/tensorflow.cs (+6 -0)
  12. src/TensorFlowNET.Keras/Datasets/Imdb.cs (+181 -56)
  13. src/TensorFlowNET.Keras/Utils/data_utils.cs (+49 -0)
  14. test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs (+38 -0)

+9 -0  src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs

@@ -4,6 +4,8 @@ using System.IO;
using System.Linq;
using System.Text;
using Tensorflow.Util;
using Razorvine.Pickle;
using Tensorflow.NumPy.Pickle;
using static Tensorflow.Binding;

namespace Tensorflow.NumPy
@@ -97,6 +99,13 @@ namespace Tensorflow.NumPy
return matrix;
}

Array ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape)
{
Stream stream = reader.BaseStream;
var unpickler = new Unpickler();
return (MultiArrayPickleWarpper)unpickler.load(stream);
}

public (NDArray, NDArray) meshgrid<T>(T[] array, bool copy = true, bool sparse = false)
{
var tensors = array_ops.meshgrid(array, copy: copy, sparse: sparse);


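A minimal sketch of what the new ReadObjectMatrix relies on: the whole stream is handed to Razorvine.Pickle's Unpickler, which resolves the numpy globals through the constructors registered in tensorflow.cs (see below), and the returned wrapper converts implicitly to NDArray or Array. The helper name here is hypothetical:

    using System.IO;
    using Razorvine.Pickle;
    using Tensorflow.NumPy;
    using Tensorflow.NumPy.Pickle;

    static NDArray UnpickleNDArray(Stream stream)
    {
        // load() drives the pickle VM; "numpy.core.multiarray._reconstruct"
        // resolves to MultiArrayConstructor, which returns a wrapper whose
        // __setstate__ the VM then calls with shape, dtype and data.
        var unpickler = new Unpickler();
        var wrapper = (MultiArrayPickleWarpper)unpickler.load(stream);
        NDArray nd = wrapper;   // implicit conversion defined on the wrapper
        return nd;
    }
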
+12 -4  src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs

@@ -27,8 +27,14 @@ namespace Tensorflow.NumPy
Array matrix = Array.CreateInstance(type, shape);

//if (type == typeof(String))
//return ReadStringMatrix(reader, matrix, bytes, type, shape);
return ReadValueMatrix(reader, matrix, bytes, type, shape);
//return ReadStringMatrix(reader, matrix, bytes, type, shape);

if (type == typeof(Object))
return ReadObjectMatrix(reader, matrix, shape);
else
{
return ReadValueMatrix(reader, matrix, bytes, type, shape);
}
}
}

@@ -37,7 +43,7 @@ namespace Tensorflow.NumPy
ICloneable, IList, ICollection, IEnumerable, IStructuralComparable, IStructuralEquatable
{
// if (typeof(T).IsArray && (typeof(T).GetElementType().IsArray || typeof(T).GetElementType() == typeof(string)))
// return LoadJagged(stream) as T;
// return LoadJagged(stream) as T;
return LoadMatrix(stream) as T;
}

@@ -93,7 +99,7 @@ namespace Tensorflow.NumPy
Type GetType(string dtype, out int bytes, out bool? isLittleEndian)
{
isLittleEndian = IsLittleEndian(dtype);
bytes = Int32.Parse(dtype.Substring(2));
bytes = dtype.Length > 2 ? Int32.Parse(dtype.Substring(2)) : 0;

string typeCode = dtype.Substring(1);

@@ -121,6 +127,8 @@ namespace Tensorflow.NumPy
return typeof(Double);
if (typeCode.StartsWith("S"))
return typeof(String);
if (typeCode.StartsWith("O"))
return typeof(Object);

throw new NotSupportedException();
}


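The `dtype.Length > 2` guard exists because the .npy header's dtype descriptor for object arrays ("|O") carries no byte width. A small stand-alone illustration of the descriptor layout GetType now accepts (the helper is hypothetical, not part of the PR):

    using System;

    static (char order, string typeCode, int bytes) ParseDescr(string dtype)
    {
        // descriptor layout: <byte-order><type-code>[<width>], e.g. "<i8", "|O"
        char order = dtype[0];                // '<' little-endian, '>' big, '|' n/a
        string typeCode = dtype.Substring(1); // "i8", "f4", "O", ...
        int bytes = dtype.Length > 2 ? int.Parse(dtype.Substring(2)) : 0;
        return (order, typeCode, bytes);
    }

    // ParseDescr("<i8") -> ('<', "i8", 8)   64-bit integers
    // ParseDescr("|O")  -> ('|', "O", 0)    pickled Python objects, no fixed width
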
+2 -2  src/TensorFlowNET.Core/NumPy/Implementation/RandomizedImpl.cs

@@ -14,9 +14,9 @@ namespace Tensorflow.NumPy
public NDArray permutation(NDArray x) => new NDArray(random_ops.random_shuffle(x));

[AutoNumPy]
public void shuffle(NDArray x)
public void shuffle(NDArray x, int? seed = null)
{
var y = random_ops.random_shuffle(x);
var y = random_ops.random_shuffle(x, seed);
Marshal.Copy(y.BufferToArray(), 0, x.TensorDataPointer, (int)x.bytesize);
}



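The new optional seed threads straight through to random_ops.random_shuffle. Imdb.load_data below uses it to shuffle an index vector once and then applies the same permutation to data and labels; a sketch of that pattern:

    var indices = np.arange<int>(10);
    np.random.shuffle(indices, 113);   // seeded shuffle of the index vector
    // x = x[indices]; y = y[indices]; // same permutation for data and labels
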
+1 -0  src/TensorFlowNET.Core/NumPy/NDArrayConverter.cs

@@ -10,6 +10,7 @@ namespace Tensorflow.NumPy
public unsafe static T Scalar<T>(NDArray nd) where T : unmanaged
=> nd.dtype switch
{
TF_DataType.TF_BOOL => Scalar<T>(*(bool*)nd.data),
TF_DataType.TF_UINT8 => Scalar<T>(*(byte*)nd.data),
TF_DataType.TF_FLOAT => Scalar<T>(*(float*)nd.data),
TF_DataType.TF_INT32 => Scalar<T>(*(int*)nd.data),


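TF_BOOL scalars were previously not covered by this switch; with the added arm, a bool NDArray can be read back as a scalar. A sketch, assuming the usual scalar NDArray constructor:

    var nd = new NDArray(true);
    var b = NDArrayConverter.Scalar<bool>(nd);   // true
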
+20 -0  src/TensorFlowNET.Core/NumPy/Pickle/DTypePickleWarpper.cs

@@ -0,0 +1,20 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace Tensorflow.NumPy.Pickle
{
public class DTypePickleWarpper
{
TF_DataType dtype { get; set; }
public DTypePickleWarpper(TF_DataType dtype)
{
this.dtype = dtype;
}
public void __setstate__(object[] args) { }
public static implicit operator TF_DataType(DTypePickleWarpper dTypeWarpper)
{
return dTypeWarpper.dtype;
}
}
}

+52 -0  src/TensorFlowNET.Core/NumPy/Pickle/DtypeConstructor.cs

@@ -0,0 +1,52 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Text;
using Razorvine.Pickle;

namespace Tensorflow.NumPy.Pickle
{
/// <summary>
/// Reconstructs numpy <c>dtype</c> objects from a pickle stream by mapping
/// numpy type codes (e.g. "i4", "f8", "O") to <c>TF_DataType</c> values.
/// </summary>
[SuppressMessage("ReSharper", "InconsistentNaming")]
[SuppressMessage("ReSharper", "MemberCanBePrivate.Global")]
[SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")]
class DtypeConstructor : IObjectConstructor
{
public object construct(object[] args)
{
var typeCode = (string)args[0];
TF_DataType dtype;
if (typeCode == "b1")
dtype = np.@bool;
else if (typeCode == "i1")
dtype = np.@byte;
else if (typeCode == "i2")
dtype = np.int16;
else if (typeCode == "i4")
dtype = np.int32;
else if (typeCode == "i8")
dtype = np.int64;
else if (typeCode == "u1")
dtype = np.ubyte;
else if (typeCode == "u2")
dtype = np.uint16;
else if (typeCode == "u4")
dtype = np.uint32;
else if (typeCode == "u8")
dtype = np.uint64;
else if (typeCode == "f4")
dtype = np.float32;
else if (typeCode == "f8")
dtype = np.float64;
else if (typeCode.StartsWith("S"))
dtype = np.@string;
else if (typeCode.StartsWith("O"))
dtype = np.@object;
else
throw new NotSupportedException();
return new DTypePickleWarpper(dtype);
}
}
}

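numpy typically pickles a dtype as a call of roughly the form numpy.dtype('i8', False, True) followed by __setstate__; the constructor above handles the call and hands back a wrapper that converts to TF_DataType on demand. A sketch (DtypeConstructor is internal to the assembly; the extra construct arguments are ignored by it):

    var ctor = new DtypeConstructor();
    var wrapped = (DTypePickleWarpper)ctor.construct(new object[] { "i8", false, true });
    TF_DataType dtype = wrapped;   // implicit conversion
    // dtype == np.int64
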
+53 -0  src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayConstructor.cs

@@ -0,0 +1,53 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Text;
using Razorvine.Pickle;
using Razorvine.Pickle.Objects;

namespace Tensorflow.NumPy.Pickle
{
/// <summary>
/// Creates multiarrays of objects. Returns a primitive type multiarray such as int[][] if
/// the objects are ints, etc.
/// </summary>
[SuppressMessage("ReSharper", "InconsistentNaming")]
[SuppressMessage("ReSharper", "MemberCanBePrivate.Global")]
[SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")]
public class MultiArrayConstructor : IObjectConstructor
{
public object construct(object[] args)
{
if (args.Length != 3)
throw new InvalidArgumentError($"Invalid number of arguments in MultiArrayConstructor._reconstruct. Expected three arguments. Given {args.Length} arguments.");

var types = (ClassDictConstructor)args[0];
if (types.module != "numpy" || types.name != "ndarray")
throw new RuntimeError("_reconstruct: First argument must be a sub-type of ndarray");

var arg1 = (object[])args[1];
var dims = new int[arg1.Length];
for (var i = 0; i < arg1.Length; i++)
{
dims[i] = (int)arg1[i];
}
var shape = new Shape(dims);

TF_DataType dtype;
string identifier;
if (args[2].GetType() == typeof(string))
identifier = (string)args[2];
else
identifier = Encoding.UTF8.GetString((byte[])args[2]);
switch (identifier)
{
case "u": dtype = np.uint32; break;
case "c": dtype = np.complex_; break;
case "f": dtype = np.float32; break;
case "b": dtype = np.@bool; break;
default: throw new NotImplementedException($"Unsupported data type: {args[2]}");
}
return new MultiArrayPickleWarpper(shape, dtype);
}
}
}

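In the pickle stream an ndarray appears roughly as _reconstruct(numpy.ndarray, (0,), b'b'), i.e. a placeholder shape and a one-character type identifier; the real shape, dtype and data only arrive later via __setstate__ on the wrapper returned here. A sketch of that first call (assuming ClassDictConstructor's (module, name) constructor from Razorvine.Pickle.Objects):

    using System.Text;
    using Razorvine.Pickle.Objects;
    using Tensorflow.NumPy.Pickle;

    var args = new object[] {
        new ClassDictConstructor("numpy", "ndarray"),  // sub-type being rebuilt
        new object[] { 0 },                            // placeholder shape (0,)
        Encoding.UTF8.GetBytes("b"),                   // elementary type id b'b'
    };
    var wrapper = (MultiArrayPickleWarpper)new MultiArrayConstructor().construct(args);
    // wrapper.__setstate__(...) is what actually fills in shape, dtype and data
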
+119 -0  src/TensorFlowNET.Core/NumPy/Pickle/MultiArrayPickleWarpper.cs

@@ -0,0 +1,119 @@
using Newtonsoft.Json.Linq;
using Serilog.Debugging;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Text;

namespace Tensorflow.NumPy.Pickle
{
public class MultiArrayPickleWarpper
{
public Shape reconstructedShape { get; set; }
public TF_DataType reconstructedDType { get; set; }
public NDArray reconstructedNDArray { get; set; }
public Array reconstructedMultiArray { get; set; }
public MultiArrayPickleWarpper(Shape shape, TF_DataType dtype)
{
reconstructedShape = shape;
reconstructedDType = dtype;
}
public void __setstate__(object[] args)
{
if (args.Length != 5)
throw new InvalidArgumentError($"Invalid number of arguments in NDArray.__setstate__. Expected five arguments. Given {args.Length} arguments.");

var version = (int)args[0]; // version

var arg1 = (object[])args[1];
var dims = new int[arg1.Length];
for (var i = 0; i < arg1.Length; i++)
{
dims[i] = (int)arg1[i];
}
var _ShapeLike = new Shape(dims); // shape

TF_DataType _DType_co = (DTypePickleWarpper)args[2]; // DType

var F_continuous = (bool)args[3]; // F-contiguous (Fortran order) flag
if (F_continuous)
throw new InvalidArgumentError("Fortran-contiguous memory layout is not supported. Please use C-contiguous layout or check the data format.");

var data = args[4]; // Data
/*
* If we ever need another pickle format, increment the version
* number. But we should still be able to handle the old versions.
*/
if (version < 0 || version > 4)
throw new ValueError($"can't handle version {version} of numpy.ndarray pickle");

// TODO: Implement the missing details and checks from the official Numpy C code here.
// https://github.com/numpy/numpy/blob/2f0bd6e86a77e4401d0384d9a75edf9470c5deb6/numpy/core/src/multiarray/descriptor.c#L2761

if (data.GetType() == typeof(ArrayList))
{
Reconstruct((ArrayList)data);
}
else
throw new NotImplementedException($"can't reconstruct pickled data of type {data.GetType()}.");
}
private void Reconstruct(ArrayList arrayList)
{
int ndim = 1;
var subArrayList = arrayList;
while (subArrayList.Count > 0 && subArrayList[0] != null && subArrayList[0].GetType() == typeof(ArrayList))
{
subArrayList = (ArrayList)subArrayList[0];
ndim += 1;
}
var type = subArrayList[0].GetType();
if (type == typeof(int))
{
if (ndim == 1)
{
int[] list = (int[])arrayList.ToArray(typeof(int));
Shape shape = new Shape(new int[] { arrayList.Count });
reconstructedMultiArray = list;
reconstructedNDArray = new NDArray(list, shape);
}
if (ndim == 2)
{
int secondDim = 0;
foreach (ArrayList subArray in arrayList)
{
secondDim = subArray.Count > secondDim ? subArray.Count : secondDim;
}
int[,] list = new int[arrayList.Count, secondDim];
for (int i = 0; i < arrayList.Count; i++)
{
var subArray = (ArrayList?)arrayList[i];
if (subArray == null)
throw new NullReferenceException("a sub-array of the pickled ArrayList is null.");
for (int j = 0; j < subArray.Count; j++)
{
var element = subArray[j];
if (element == null)
throw new NoNullAllowedException("the element of ArrayList cannot be null.");
list[i, j] = (int)element;
}
}
Shape shape = new Shape(new int[] { arrayList.Count, secondDim });
reconstructedMultiArray = list;
reconstructedNDArray = new NDArray(list, shape);
}
if (ndim > 2)
throw new NotImplementedException("can't handle ArrayList with more than two dimensions.");
}
else
throw new NotImplementedException($"can't reconstruct ArrayList elements of type {type}.");
}
public static implicit operator Array(MultiArrayPickleWarpper arrayWarpper)
{
return arrayWarpper.reconstructedMultiArray;
}
public static implicit operator NDArray(MultiArrayPickleWarpper arrayWarpper)
{
return arrayWarpper.reconstructedNDArray;
}
}
}

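Putting the two wrappers together, __setstate__ can be exercised directly. This sketch rebuilds a 2x3 int matrix the same way the pickle VM would (the placeholder shape and dtype passed to the constructor are superseded by the state):

    using System.Collections;
    using Tensorflow.NumPy;
    using Tensorflow.NumPy.Pickle;

    var wrapper = new MultiArrayPickleWarpper(new Shape(new int[] { 0 }), np.@bool);
    wrapper.__setstate__(new object[] {
        1,                                  // pickle protocol version
        new object[] { 2, 3 },              // real shape
        new DTypePickleWarpper(np.int32),   // real dtype
        false,                              // not Fortran-contiguous
        new ArrayList {                     // nested data, row by row
            new ArrayList { 1, 2, 3 },
            new ArrayList { 4, 5, 6 },
        },
    });
    NDArray nd = wrapper;                   // 2x3 NDArray of int
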
+3 -1  src/TensorFlowNET.Core/Numpy/Numpy.cs

@@ -43,7 +43,9 @@ public partial class np
public static readonly TF_DataType @decimal = TF_DataType.TF_DOUBLE;
public static readonly TF_DataType complex_ = TF_DataType.TF_COMPLEX;
public static readonly TF_DataType complex64 = TF_DataType.TF_COMPLEX64;
public static readonly TF_DataType complex128 = TF_DataType.TF_COMPLEX128;
public static readonly TF_DataType complex128 = TF_DataType.TF_COMPLEX128;
public static readonly TF_DataType @string = TF_DataType.TF_STRING;
public static readonly TF_DataType @object = TF_DataType.TF_VARIANT;
#endregion

public static double nan => double.NaN;


+1 -0  src/TensorFlowNET.Core/Tensorflow.Binding.csproj

@@ -176,6 +176,7 @@ https://tensorflownet.readthedocs.io</Description>
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="OneOf" Version="3.0.255" />
<PackageReference Include="Protobuf.Text" Version="0.7.1" />
<PackageReference Include="Razorvine.Pickle" Version="1.4.0" />
<PackageReference Include="Serilog.Sinks.Console" Version="4.1.0" />
</ItemGroup>



+6 -0  src/TensorFlowNET.Core/tensorflow.cs

@@ -14,6 +14,7 @@
limitations under the License.
******************************************************************************/

using Razorvine.Pickle;
using Serilog;
using Serilog.Core;
using System.Reflection;
@@ -22,6 +23,7 @@ using Tensorflow.Contexts;
using Tensorflow.Eager;
using Tensorflow.Gradients;
using Tensorflow.Keras;
using Tensorflow.NumPy.Pickle;

namespace Tensorflow
{
@@ -98,6 +100,10 @@ namespace Tensorflow
"please visit https://github.com/SciSharp/TensorFlow.NET. If it still not work after installing the backend, please submit an " +
"issue to https://github.com/SciSharp/TensorFlow.NET/issues");
}

// register numpy reconstructor for pickle
Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor());
Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor());
}

public string VERSION => c_api.StringPiece(c_api.TF_Version());


+181 -56  src/TensorFlowNET.Keras/Datasets/Imdb.cs

@@ -3,8 +3,6 @@ using System.Collections.Generic;
using System.IO;
using System.Text;
using Tensorflow.Keras.Utils;
using Tensorflow.NumPy;
using System.Linq;

namespace Tensorflow.Keras.Datasets
{
@@ -12,11 +10,57 @@ namespace Tensorflow.Keras.Datasets
/// This is a dataset of 25,000 movie reviews from IMDB, labeled by sentiment
/// (positive/negative). Reviews have been preprocessed, and each review is
/// encoded as a list of word indexes (integers).
/// For convenience, words are indexed by overall frequency in the dataset,
/// so that for instance the integer "3" encodes the 3rd most frequent word in
/// the data. This allows for quick filtering operations such as:
/// "only consider the top 10,000 most
/// common words, but eliminate the top 20 most common words".
/// As a convention, "0" does not stand for a specific word, but instead is used
/// to encode the pad token.
/// Args:
///     path: where to cache the data (relative to %TEMP%/imdb/imdb.npz).
///     num_words: integer or None. Words are
///         ranked by how often they occur (in the training set) and only
///         the `num_words` most frequent words are kept. Any less frequent word
///         will appear as `oov_char` value in the sequence data. If None,
///         all words are kept. Defaults to `None`.
///     skip_top: skip the top N most frequently occurring words
///         (which may not be informative). These words will appear as
///         `oov_char` value in the dataset. When 0, no words are
///         skipped. Defaults to `0`.
///     maxlen: int or None. Maximum sequence length.
///         Any longer sequence will be truncated. None means no truncation.
///         Defaults to `None`.
///     seed: int. Seed for reproducible data shuffling.
///     start_char: int. The start of a sequence will be marked with this
///         character. 0 is usually the padding character. Defaults to `1`.
///     oov_char: int. The out-of-vocabulary character.
///         Words that were cut out because of the `num_words` or
///         `skip_top` limits will be replaced with this character.
///     index_from: int. Index actual words with this index and higher.
/// Returns:
///     Tuple of Numpy arrays: `(x_train, labels_train), (x_test, labels_test)`.
///
///     **x_train, x_test**: lists of sequences, which are lists of indexes
///     (integers). If the `num_words` argument was specified, the maximum
///     possible index value is `num_words - 1`. If the `maxlen` argument was
///     specified, the largest possible sequence length is `maxlen`.
///
///     **labels_train, labels_test**: lists of integer labels (1 or 0).
///
/// Raises:
///     ValueError: in case `maxlen` is so low
///         that no input sequence could be kept.
/// Note that the 'out of vocabulary' character is only used for
/// words that were present in the training set but are not included
/// because they're not making the `num_words` cut here.
/// Words that were not seen in the training set but are in the test set
/// have simply been skipped.
/// </summary>
/// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
public class Imdb
{
string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
string file_name = "imdb.npz";
string dest_folder = "imdb";

/// <summary>
@@ -31,40 +75,150 @@ namespace Tensorflow.Keras.Datasets
/// <param name="oov_char"></param>
/// <param name="index_from"></param>
/// <returns></returns>
public DatasetPass load_data(string? path = "imdb.npz",
int num_words = -1,
public DatasetPass load_data(
string path = "imdb.npz",
int? num_words = null,
int skip_top = 0,
int maxlen = -1,
int? maxlen = null,
int seed = 113,
int start_char = 1,
int oov_char= 2,
int? start_char = 1,
int? oov_char = 2,
int index_from = 3)
{
if (maxlen == -1) throw new InvalidArgumentError("maxlen must be assigned.");
var dst = path ?? Download();
path = data_utils.get_file(
path,
origin: Path.Combine(origin_folder, "imdb.npz"),
file_hash: "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f"
);
path = Path.Combine(path, "imdb.npz");
var fileBytes = File.ReadAllBytes(path);
var (x_train, x_test) = LoadX(fileBytes);
var (labels_train, labels_test) = LoadY(fileBytes);

var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
var x_train_string = new string[lines.Length];
var y_train = np.zeros(new int[] { lines.Length }, np.int64);
for (int i = 0; i < lines.Length; i++)
var indices = np.arange<int>(len(x_train));
np.random.shuffle(indices, seed);
x_train = x_train[indices];
labels_train = labels_train[indices];

indices = np.arange<int>(len(x_test));
np.random.shuffle(indices, seed);
x_test = x_test[indices];
labels_test = labels_test[indices];

var x_train_array = (int[,])x_train.ToMultiDimArray<int>();
var x_test_array = (int[,])x_test.ToMultiDimArray<int>();
var labels_train_array = (long[])labels_train.ToArray<long>();
var labels_test_array = (long[])labels_test.ToArray<long>();

if (start_char != null)
{
y_train[i] = long.Parse(lines[i].Substring(0, 1));
x_train_string[i] = lines[i].Substring(2);
int[,] new_x_train_array = new int[x_train_array.GetLength(0), x_train_array.GetLength(1) + 1];
for (var i = 0; i < x_train_array.GetLength(0); i++)
{
new_x_train_array[i, 0] = (int)start_char;
for (var j = 0; j < x_train_array.GetLength(1); j++)
{
if (x_train_array[i, j] == 0)
break;
new_x_train_array[i, j + 1] = x_train_array[i, j] + index_from;
}
}
int[,] new_x_test_array = new int[x_test_array.GetLength(0), x_test_array.GetLength(1) + 1];
for (var i = 0; i < x_test_array.GetLength(0); i++)
{
new_x_test_array[i, 0] = (int)start_char;
for (var j = 0; j < x_test_array.GetLength(1); j++)
{
if (x_test_array[i, j] == 0)
break;
new_x_test_array[i, j + 1] = x_test_array[i, j] + index_from;
}
}
x_train_array = new_x_train_array;
x_test_array = new_x_test_array;
}
else if (index_from != 0)
{
for (var i = 0; i < x_train_array.GetLength(0); i++)
{
for (var j = 0; j < x_train_array.GetLength(1); j++)
{
if (x_train_array[i, j] == 0)
break;
x_train_array[i, j] += index_from;
}
}
for (var i = 0; i < x_test_array.GetLength(0); i++)
{
for (var j = 0; j < x_test_array.GetLength(1); j++)
{
if (x_test_array[i, j] == 0)
break;
x_test_array[i, j] += index_from;
}
}
}

var x_train = keras.preprocessing.sequence.pad_sequences(PraseData(x_train_string), maxlen: maxlen);
if (maxlen == null)
{
maxlen = max(x_train_array.GetLength(1), x_test_array.GetLength(1));
}
(x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train_array, labels_train_array);
(x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test_array, labels_test_array);
if (x_train.size == 0 || x_test.size == 0)
throw new ValueError("After filtering for sequences shorter than maxlen=" +
$"{maxlen}, no sequence was kept. Increase maxlen.");

lines = File.ReadAllLines(Path.Combine(dst, "imdb_test.txt"));
var x_test_string = new string[lines.Length];
var y_test = np.zeros(new int[] { lines.Length }, np.int64);
for (int i = 0; i < lines.Length; i++)
var xs = np.concatenate(new[] { x_train, x_test });
var labels = np.concatenate(new[] { labels_train, labels_test });
var xs_array = (int[,])xs.ToMultiDimArray<int>();

if (num_words == null)
{
y_test[i] = long.Parse(lines[i].Substring(0, 1));
x_test_string[i] = lines[i].Substring(2);
num_words = 0;
for (var i = 0; i < xs_array.GetLength(0); i++)
for (var j = 0; j < xs_array.GetLength(1); j++)
num_words = max((int)num_words, (int)xs_array[i, j]);
}

var x_test = keras.preprocessing.sequence.pad_sequences(PraseData(x_test_string), maxlen: maxlen);
// by convention, use 2 as OOV word
// reserve 'index_from' (=3 by default) characters:
// 0 (padding), 1 (start), 2 (OOV)
if (oov_char != null)
{
int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)];
for (var i = 0; i < xs_array.GetLength(0); i++)
{
for (var j = 0; j < xs_array.GetLength(1); j++)
{
if (xs_array[i, j] == 0 || (skip_top <= xs_array[i, j] && xs_array[i, j] < num_words))
new_xs_array[i, j] = xs_array[i, j];
else
new_xs_array[i, j] = (int)oov_char;
}
}
xs = new NDArray(new_xs_array);
}
else
{
int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)];
for (var i = 0; i < xs_array.GetLength(0); i++)
{
int k = 0;
for (var j = 0; j < xs_array.GetLength(1); j++)
{
if (xs_array[i, j] == 0 || (skip_top <= xs_array[i, j] && xs_array[i, j] < num_words))
new_xs_array[i, k++] = xs_array[i, j];
}
}
xs = new NDArray(new_xs_array);
}

var idx = len(x_train);
x_train = xs[$"0:{idx}"];
x_test = xs[$"{idx}:"];
var y_train = labels[$"0:{idx}"];
var y_test = labels[$"{idx}:"];

return new DatasetPass
{
@@ -75,8 +229,8 @@ namespace Tensorflow.Keras.Datasets

(NDArray, NDArray) LoadX(byte[] bytes)
{
var y = np.Load_Npz<byte[]>(bytes);
return (y["x_train.npy"], y["x_test.npy"]);
var x = np.Load_Npz<int[,]>(bytes);
return (x["x_train.npy"], x["x_test.npy"]);
}

(NDArray, NDArray) LoadY(byte[] bytes)
@@ -84,34 +238,5 @@ namespace Tensorflow.Keras.Datasets
var y = np.Load_Npz<long[]>(bytes);
return (y["y_train.npy"], y["y_test.npy"]);
}

string Download()
{
var dst = Path.Combine(Path.GetTempPath(), dest_folder);
Directory.CreateDirectory(dst);

Web.Download(origin_folder + file_name, dst, file_name);

return dst;
// return Path.Combine(dst, file_name);
}

protected IEnumerable<int[]> PraseData(string[] x)
{
var data_list = new List<int[]>();
for (int i = 0; i < len(x); i++)
{
var list_string = x[i];
var cleaned_list_string = list_string.Replace("[", "").Replace("]", "").Replace(" ", "");
string[] number_strings = cleaned_list_string.Split(',');
int[] numbers = new int[number_strings.Length];
for (int j = 0; j < number_strings.Length; j++)
{
numbers[j] = int.Parse(number_strings[j]);
}
data_list.Add(numbers);
}
return data_list;
}
}
}

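A worked micro-example of the id pipeline above with the defaults (start_char=1, index_from=3, oov_char=2, skip_top=0) and num_words=10, in plain C# independent of the classes in this diff:

    using System;
    using System.Linq;

    int start_char = 1, index_from = 3, oov_char = 2, skip_top = 0, num_words = 10;
    int[] raw = { 4, 9 };                                    // ranks stored in imdb.npz
    var shifted = new[] { start_char }
        .Concat(raw.Select(w => w + index_from)).ToArray();  // { 1, 7, 12 }
    var capped = shifted
        .Select(w => skip_top <= w && w < num_words ? w : oov_char)
        .ToArray();                                          // { 1, 7, 2 }
    Console.WriteLine(string.Join(", ", capped));            // 1, 7, 2
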
+49 -0  src/TensorFlowNET.Keras/Utils/data_utils.cs

@@ -39,5 +39,54 @@ namespace Tensorflow.Keras.Utils

return datadir;
}

public static (NDArray, NDArray) _remove_long_seq(int maxlen, NDArray seq, NDArray label)
{
/*Removes sequences that exceed the maximum length.

Args:
maxlen: Int, maximum length of the output sequences.
seq: List of lists, where each sublist is a sequence.
label: List where each element is an integer.

Returns:
new_seq, new_label: shortened lists for `seq` and `label`.

*/
List<int[]> new_seq = new List<int[]>();
List<long> new_label = new List<long>();

var seq_array = (int[,])seq.ToMultiDimArray<int>();
var label_array = (long[])label.ToArray<long>();
for (var i = 0; i < seq_array.GetLength(0); i++)
{
if (maxlen < seq_array.GetLength(1) && seq_array[i,maxlen] != 0)
continue;
int[] sentence = new int[maxlen];
for (var j = 0; j < maxlen && j < seq_array.GetLength(1); j++)
{
sentence[j] = seq_array[i, j];
}
new_seq.Add(sentence);
new_label.Add(label_array[i]);
}

int[,] new_seq_array = new int[new_seq.Count, maxlen];
long[] new_label_array = new long[new_label.Count];

for (var i = 0; i < new_seq.Count; i++)
{
for (var j = 0; j < maxlen; j++)
{
new_seq_array[i, j] = new_seq[i][j];
}
}

for (var i = 0; i < new_label.Count; i++)
{
new_label_array[i] = new_label[i];
}
return (new_seq_array, new_label_array);
}
}
}

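A small sketch of how _remove_long_seq behaves (relying on the same implicit array-to-NDArray conversions its return statement uses): rows whose content runs past maxlen are dropped, and survivors come back truncated or zero-padded to exactly maxlen columns.

    var seq = new int[,] {
        { 5, 6, 7 },   // non-zero id at column maxlen=2 -> row is dropped
        { 8, 9, 0 },   // effective length 2 -> kept
    };
    var labels = new long[] { 1L, 0L };
    var (new_seq, new_labels) = data_utils._remove_long_seq(2, seq, labels);
    // new_seq is the 1x2 matrix { { 8, 9 } }; new_labels is { 0 }
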
+38 -0  test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs

@@ -1,7 +1,10 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Collections.Generic;
using System.Linq;
using Tensorflow.NumPy;
using static Tensorflow.Binding;
using static Tensorflow.KerasApi;

namespace TensorFlowNET.UnitTest.Dataset
{
@@ -195,5 +198,40 @@ namespace TensorFlowNET.UnitTest.Dataset

Assert.IsFalse(allEqual);
}
[Ignore]
[TestMethod]
public void GetData()
{
var vocab_size = 20000; // Only consider the top 20k words
var maxlen = 200; // Only consider the first 200 words of each movie review
var dataset = keras.datasets.imdb.load_data(num_words: vocab_size, maxlen: maxlen);
var x_train = dataset.Train.Item1;
var y_train = dataset.Train.Item2;
var x_val = dataset.Test.Item1;
var y_val = dataset.Test.Item2;

x_train = keras.preprocessing.sequence.pad_sequences(RemoveZeros(x_train), maxlen: maxlen);
x_val = keras.preprocessing.sequence.pad_sequences(RemoveZeros(x_val), maxlen: maxlen);
print(len(x_train) + " Training sequences");
print(len(x_val) + " Validation sequences");
}
IEnumerable<int[]> RemoveZeros(NDArray data)
{
var data_array = (int[,])data.ToMultiDimArray<int>();
List<int[]> new_data = new List<int[]>();
for (var i = 0; i < data_array.GetLength(0); i++)
{
List<int> new_array = new List<int>();
for (var j = 0; j < data_array.GetLength(1); j++)
{
if (data_array[i, j] == 0)
break;
else
new_array.Add(data_array[i, j]);
}
new_data.Add(new_array.ToArray());
}
return new_data;
}
}
}
