diff --git a/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs b/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs
new file mode 100644
index 00000000..f84f408e
--- /dev/null
+++ b/src/TensorFlowNET.Core/NumPy/DtypeConstructor.cs
@@ -0,0 +1,40 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Text;
+using Razorvine.Pickle;
+
+namespace Tensorflow.NumPy
+{
+ ///
+ ///
+ ///
+ [SuppressMessage("ReSharper", "InconsistentNaming")]
+ [SuppressMessage("ReSharper", "MemberCanBePrivate.Global")]
+ [SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")]
+ class DtypeConstructor : IObjectConstructor
+ {
+ public object construct(object[] args)
+ {
+ Console.WriteLine("DtypeConstructor");
+ Console.WriteLine(args.Length);
+ for (int i = 0; i < args.Length; i++)
+ {
+ Console.WriteLine(args[i]);
+ }
+ return new demo();
+ }
+ }
+ class demo
+ {
+ public void __setstate__(object[] args)
+ {
+ Console.WriteLine("demo __setstate__");
+ Console.WriteLine(args.Length);
+ for (int i = 0; i < args.Length; i++)
+ {
+ Console.WriteLine(args[i]);
+ }
+ }
+ }
+}
diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs
index f29879b0..80b62198 100644
--- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs
+++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.Creation.cs
@@ -4,6 +4,7 @@ using System.IO;
using System.Linq;
using System.Text;
using Tensorflow.Util;
+using Razorvine.Pickle;
using static Tensorflow.Binding;
namespace Tensorflow.NumPy
@@ -93,10 +94,25 @@ namespace Tensorflow.NumPy
var buffer = reader.ReadBytes(bytes * total);
System.Buffer.BlockCopy(buffer, 0, matrix, 0, buffer.Length);
-
return matrix;
}
+ NDArray ReadObjectMatrix(BinaryReader reader, Array matrix, int[] shape)
+ {
+ //int data = reader.ReadByte();
+ //Console.WriteLine(data);
+ //Console.WriteLine(reader.ReadByte());
+ Stream stream = reader.BaseStream;
+ Unpickler.registerConstructor("numpy.core.multiarray", "_reconstruct", new MultiArrayConstructor());
+ Unpickler.registerConstructor("numpy", "dtype", new DtypeConstructor());
+
+ var unpickler = new Unpickler();
+
+ NDArray result = (NDArray) unpickler.load(stream);
+ Console.WriteLine(result.dims);
+ return result;
+ }
+
public (NDArray, NDArray) meshgrid(T[] array, bool copy = true, bool sparse = false)
{
var tensors = array_ops.meshgrid(array, copy: copy, sparse: sparse);
diff --git a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs
index 05f53d5e..789f119a 100644
--- a/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs
+++ b/src/TensorFlowNET.Core/NumPy/Implementation/NumPyImpl.load.cs
@@ -27,9 +27,20 @@ namespace Tensorflow.NumPy
Array matrix = Array.CreateInstance(type, shape);
//if (type == typeof(String))
- //return ReadStringMatrix(reader, matrix, bytes, type, shape);
+ //return ReadStringMatrix(reader, matrix, bytes, type, shape);
+ NDArray res = ReadObjectMatrix(reader, matrix, shape);
+ Console.WriteLine("LoadMatrix");
+ Console.WriteLine(res.dims[0]);
+ Console.WriteLine((int)res[0][0]);
+ Console.WriteLine(res.dims[1]);
+ //if (type == typeof(Object))
+ //{
+
+ //}
+ //else
return ReadValueMatrix(reader, matrix, bytes, type, shape);
}
+
}
public T Load(Stream stream)
@@ -37,7 +48,7 @@ namespace Tensorflow.NumPy
ICloneable, IList, ICollection, IEnumerable, IStructuralComparable, IStructuralEquatable
{
// if (typeof(T).IsArray && (typeof(T).GetElementType().IsArray || typeof(T).GetElementType() == typeof(string)))
- // return LoadJagged(stream) as T;
+ // return LoadJagged(stream) as T;
return LoadMatrix(stream) as T;
}
@@ -48,7 +59,7 @@ namespace Tensorflow.NumPy
shape = null;
// The first 6 bytes are a magic string: exactly "x93NUMPY"
- if (reader.ReadChar() != 63) return false;
+ if (reader.ReadByte() != 0x93) return false;
if (reader.ReadChar() != 'N') return false;
if (reader.ReadChar() != 'U') return false;
if (reader.ReadChar() != 'M') return false;
@@ -64,6 +75,7 @@ namespace Tensorflow.NumPy
ushort len = reader.ReadUInt16();
string header = new String(reader.ReadChars(len));
+ Console.WriteLine(header);
string mark = "'descr': '";
int s = header.IndexOf(mark) + mark.Length;
int e = header.IndexOf("'", s + 1);
@@ -93,7 +105,7 @@ namespace Tensorflow.NumPy
Type GetType(string dtype, out int bytes, out bool? isLittleEndian)
{
isLittleEndian = IsLittleEndian(dtype);
- bytes = Int32.Parse(dtype.Substring(2));
+ bytes = dtype.Length > 2 ? Int32.Parse(dtype.Substring(2)) : 0;
string typeCode = dtype.Substring(1);
@@ -121,6 +133,8 @@ namespace Tensorflow.NumPy
return typeof(Double);
if (typeCode.StartsWith("S"))
return typeof(String);
+ if (typeCode == "O")
+ return typeof(Object);
throw new NotSupportedException();
}
diff --git a/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs b/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs
new file mode 100644
index 00000000..92927cd5
--- /dev/null
+++ b/src/TensorFlowNET.Core/NumPy/MultiArrayConstructor.cs
@@ -0,0 +1,44 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Text;
+using Razorvine.Pickle;
+
+namespace Tensorflow.NumPy
+{
+ ///
+ /// Creates multiarrays of objects. Returns a primitive type multiarray such as int[][] if
+ /// the objects are ints, etc.
+ ///
+ [SuppressMessage("ReSharper", "InconsistentNaming")]
+ [SuppressMessage("ReSharper", "MemberCanBePrivate.Global")]
+ [SuppressMessage("ReSharper", "MemberCanBeMadeStatic.Global")]
+ public class MultiArrayConstructor : IObjectConstructor
+ {
+ public object construct(object[] args)
+ {
+ //Console.WriteLine(args.Length);
+ //for (int i = 0; i < args.Length; i++)
+ //{
+ // Console.WriteLine(args[i]);
+ //}
+ Console.WriteLine("MultiArrayConstructor");
+
+ var arg1 = (Object[])args[1];
+ var dims = new int[arg1.Length];
+ for (var i = 0; i < arg1.Length; i++)
+ {
+ dims[i] = (int)arg1[i];
+ }
+
+ var dtype = TF_DataType.DtInvalid;
+ switch (args[2])
+ {
+ case "b": dtype = TF_DataType.DtUint8Ref; break;
+ default: throw new NotImplementedException("cannot parse" + args[2]);
+ }
+ return new NDArray(new Shape(dims), dtype);
+
+ }
+ }
+}
diff --git a/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs b/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs
new file mode 100644
index 00000000..b4d66243
--- /dev/null
+++ b/src/TensorFlowNET.Core/NumPy/NDArray.Pickle.cs
@@ -0,0 +1,19 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Tensorflow.NumPy
+{
+ public partial class NDArray
+ {
+ public void __setstate__(object[] args)
+ {
+ Console.WriteLine("NDArray __setstate__");
+ Console.WriteLine(args.Length);
+ for (int i = 0; i < args.Length; i++)
+ {
+ Console.WriteLine(args[i]);
+ }
+ }
+ }
+}
diff --git a/src/TensorFlowNET.Core/Tensorflow.Binding.csproj b/src/TensorFlowNET.Core/Tensorflow.Binding.csproj
index 09f5b077..38778c3f 100644
--- a/src/TensorFlowNET.Core/Tensorflow.Binding.csproj
+++ b/src/TensorFlowNET.Core/Tensorflow.Binding.csproj
@@ -112,6 +112,7 @@ https://tensorflownet.readthedocs.io
+
diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs
index 56b0d2a7..016b352d 100644
--- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs
+++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs
@@ -5,6 +5,13 @@ using System.Text;
using Tensorflow.Keras.Utils;
using Tensorflow.NumPy;
using System.Linq;
+using Google.Protobuf.Collections;
+using Microsoft.VisualBasic;
+using OneOf.Types;
+using static HDF.PInvoke.H5;
+using System.Data;
+using System.Reflection.Emit;
+using System.Xml.Linq;
namespace Tensorflow.Keras.Datasets
{
@@ -12,13 +19,59 @@ namespace Tensorflow.Keras.Datasets
/// This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment
/// (positive/negative). Reviews have been preprocessed, and each review is
/// encoded as a list of word indexes(integers).
+ /// For convenience, words are indexed by overall frequency in the dataset,
+ /// so that for instance the integer "3" encodes the 3rd most frequent word in
+ /// the data. This allows for quick filtering operations such as:
+ /// "only consider the top 10,000 most
+ /// common words, but eliminate the top 20 most common words".
+ /// As a convention, "0" does not stand for a specific word, but instead is used
+ /// to encode the pad token.
+ /// Args:
+ /// path: where to cache the data (relative to %TEMP%/imdb/imdb.npz).
+ /// num_words: integer or None. Words are
+ /// ranked by how often they occur (in the training set) and only
+ /// the `num_words` most frequent words are kept. Any less frequent word
+ /// will appear as `oov_char` value in the sequence data. If None,
+ /// all words are kept. Defaults to `None`.
+ /// skip_top: skip the top N most frequently occurring words
+ /// (which may not be informative). These words will appear as
+ /// `oov_char` value in the dataset. When 0, no words are
+ /// skipped. Defaults to `0`.
+ /// maxlen: int or None. Maximum sequence length.
+ /// Any longer sequence will be truncated. None means no truncation.
+ /// Defaults to `None`.
+ /// seed: int. Seed for reproducible data shuffling.
+ /// start_char: int. The start of a sequence will be marked with this
+ /// character. 0 is usually the padding character. Defaults to `1`.
+ /// oov_char: int. The out-of-vocabulary character.
+ /// Words that were cut out because of the `num_words` or
+ /// `skip_top` limits will be replaced with this character.
+ /// index_from: int. Index actual words with this index and higher.
+ /// Returns:
+ /// Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+ ///
+ /// ** x_train, x_test**: lists of sequences, which are lists of indexes
+ /// (integers). If the num_words argument was specified, the maximum
+ /// possible index value is `num_words - 1`. If the `maxlen` argument was
+ /// specified, the largest possible sequence length is `maxlen`.
+ ///
+ /// ** y_train, y_test**: lists of integer labels(1 or 0).
+ ///
+ /// Raises:
+ /// ValueError: in case `maxlen` is so low
+ /// that no input sequence could be kept.
+ /// Note that the 'out of vocabulary' character is only used for
+ /// words that were present in the training set but are not included
+ /// because they're not making the `num_words` cut here.
+ /// Words that were not seen in the training set but are in the test set
+ /// have simply been skipped.
///
+ /// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
public class Imdb
{
string origin_folder = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/";
string file_name = "imdb.npz";
string dest_folder = "imdb";
-
///
/// Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
///
@@ -41,8 +94,10 @@ namespace Tensorflow.Keras.Datasets
int index_from = 3)
{
var dst = Download();
-
- var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
+ var fileBytes = File.ReadAllBytes(Path.Combine(dst, file_name));
+ var (x_train, x_test) = LoadX(fileBytes);
+ var (y_train, y_test) = LoadY(fileBytes);
+ /*var lines = File.ReadAllLines(Path.Combine(dst, "imdb_train.txt"));
var x_train_string = new string[lines.Length];
var y_train = np.zeros(new int[] { lines.Length }, np.int64);
for (int i = 0; i < lines.Length; i++)
@@ -62,7 +117,7 @@ namespace Tensorflow.Keras.Datasets
x_test_string[i] = lines[i].Substring(2);
}
- var x_test = np.array(x_test_string);
+ var x_test = np.array(x_test_string);*/
return new DatasetPass
{
diff --git a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs
index 8317346e..778290bb 100644
--- a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs
+++ b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs
@@ -1,7 +1,9 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
+using System.Collections.Generic;
using System.Linq;
using static Tensorflow.Binding;
+using static Tensorflow.KerasApi;
namespace TensorFlowNET.UnitTest.Dataset
{
@@ -195,5 +197,20 @@ namespace TensorFlowNET.UnitTest.Dataset
Assert.IsFalse(allEqual);
}
+ [TestMethod]
+ public void GetData()
+ {
+ var vocab_size = 20000; // Only consider the top 20k words
+ var maxlen = 200; // Only consider the first 200 words of each movie review
+ var dataset = keras.datasets.imdb.load_data(num_words: vocab_size);
+ var x_train = dataset.Train.Item1;
+ var y_train = dataset.Train.Item2;
+ var x_val = dataset.Test.Item1;
+ var y_val = dataset.Test.Item2;
+ print(len(x_train) + "Training sequences");
+ print(len(x_val) + "Validation sequences");
+ x_train = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_train, maxlen: maxlen);
+ x_val = keras.preprocessing.sequence.pad_sequences((IEnumerable)x_val, maxlen: maxlen);
+ }
}
}