@@ -67,8 +67,26 @@ namespace Tensorflow | |||
string name = null, string @uint = "BYTE") | |||
=> ops.substr(input, pos, len, @uint: @uint, name: name); | |||
/// <summary>
/// Computes the length of each string in <paramref name="input"/>; thin wrapper
/// over <c>ops.string_length</c> (the "StringLength" op).
/// </summary>
/// <param name="input">String tensor.</param>
/// <param name="name">Optional name for the operation.</param>
/// <param name="unit">Counting unit attribute forwarded to the op; defaults to "BYTE".</param>
/// <returns>Integer tensor of per-element string lengths.</returns>
public Tensor string_length(Tensor input, string name = null, string unit = "BYTE")
    => ops.string_length(input, name: name, unit: unit);
/// <summary>
/// Splits each element of <paramref name="input"/> into substrings, returning a
/// <see cref="RaggedTensor"/>; thin wrapper over <c>ops.string_split_v2</c>.
/// </summary>
/// <param name="input">String tensor to split.</param>
/// <param name="sep">Separator; "" by default — presumably whitespace-split as in TF's strings.split, TODO confirm against the op.</param>
/// <param name="maxsplit">Maximum number of splits; -1 presumably means unlimited — TODO confirm.</param>
/// <param name="name">Optional name for the operation.</param>
public RaggedTensor split(Tensor input, string sep = "", int maxsplit = -1, string name = null)
    => ops.string_split_v2(input, sep: sep, maxsplit : maxsplit, name : name);
/// <summary>
/// Decodes each string into Unicode code points together with the byte offset at
/// which each code point starts; forwards to <c>ops.unicode_decode_with_offsets</c>.
/// </summary>
/// <param name="input">String tensor to decode.</param>
/// <param name="input_encoding">Encoding of the input strings, e.g. "UTF-8".</param>
/// <param name="errors">Decode-error policy forwarded to the op (default "replace").</param>
/// <param name="replacement_char">Code point substituted on decode errors; defaults to U+FFFD.</param>
/// <param name="replace_control_characters">Forwarded to the op — presumably also replaces control characters, TODO confirm semantics.</param>
/// <param name="name">Optional name for the operation.</param>
/// <returns>Tuple of (code points, byte start offsets) as ragged tensors.</returns>
public (RaggedTensor, RaggedTensor) unicode_decode_with_offsets(Tensor input, string input_encoding,
    string errors = "replace", int replacement_char = 0xFFFD,
    bool replace_control_characters = false, string name = null)
    => ops.unicode_decode_with_offsets(input, input_encoding, errors,
        replacement_char: replacement_char,
        replace_control_characters: replace_control_characters,
        name: name);
} | |||
} | |||
} |
@@ -44,6 +44,22 @@ namespace Tensorflow | |||
=> tf.Context.ExecuteOp("Substr", name, new ExecuteOpArgs(input, pos, len) | |||
.SetAttributes(new { unit = @uint })); | |||
/// <summary>
/// Computes the length of each string given in the input tensor by executing
/// the "StringLength" op.
/// </summary>
/// <param name="input">String tensor.</param>
/// <param name="name">Optional name for the operation.</param>
/// <param name="unit">Value of the op's "unit" attribute; defaults to "BYTE".</param>
/// <returns>Integer tensor of per-element lengths.</returns>
public Tensor string_length(Tensor input, string name = null, string unit = "BYTE")
    => tf.Context.ExecuteOp("StringLength", name, new ExecuteOpArgs(input)
    {
        // Recorded so gradient machinery can replay the op with the same "unit" attr.
        GetGradientAttrs = op => new
        {
            unit = op.get_attr<string>("unit")
        }
    }.SetAttributes(new { unit }));
public RaggedTensor string_split_v2(Tensor input, string sep = "", int maxsplit = -1, string name = null) | |||
{ | |||
return tf_with(ops.name_scope(name, "StringSplit"), scope => | |||
@@ -69,5 +85,49 @@ namespace Tensorflow | |||
validate: false); | |||
}); | |||
} | |||
/// <summary>
/// Decodes strings into code points and byte start offsets under a
/// "UnicodeDecodeWithOffsets" name scope; the work happens in
/// <see cref="_unicode_decode"/> with <c>with_offsets: true</c>.
/// </summary>
public (RaggedTensor, RaggedTensor) unicode_decode_with_offsets(Tensor input, string input_encoding, string errors,
    int replacement_char = 0xFFFD, bool replace_control_characters = false, string name = null)
{
    return tf_with(ops.name_scope(name, "UnicodeDecodeWithOffsets"), scope =>
    {
        // NOTE(review): `scope` is unused and the raw `name` (not the scoped
        // name) is forwarded to _unicode_decode — confirm this is intended.
        var (codepoints, byte_start_offsets) = _unicode_decode(input, input_encoding, errors,
            replacement_char, replace_control_characters,
            with_offsets: true, name: name);
        return (codepoints, byte_start_offsets);
    });
}
/// <summary>
/// Executes the "UnicodeDecodeWithOffsets" op and wraps its flat outputs as
/// ragged tensors. Only the offsets variant (<paramref name="with_offsets"/>
/// = true) is implemented.
/// </summary>
/// <param name="input">String tensor to decode.</param>
/// <param name="input_encoding">Encoding of the input strings, e.g. "UTF-8".</param>
/// <param name="errors">Decode-error policy attribute.</param>
/// <param name="replacement_char">Code point substituted on decode errors.</param>
/// <param name="replace_control_characters">Op attribute — semantics per the TF op definition.</param>
/// <param name="with_offsets">Must currently be true; the plain decode path is not implemented.</param>
/// <param name="name">Optional name for the operation.</param>
/// <returns>Tuple of (code points, byte start offsets) as ragged tensors.</returns>
(RaggedTensor, RaggedTensor) _unicode_decode(Tensor input, string input_encoding, string errors, int replacement_char,
    bool replace_control_characters, bool with_offsets, string name = null)
{
    if (!with_offsets)
        // Fail loudly instead of silently returning (null, null) as before —
        // callers would otherwise hit a NullReferenceException far from the cause.
        throw new System.NotImplementedException("_unicode_decode currently supports with_offsets: true only.");

    // Flat op outputs: [0] row_splits, [1] decoded code points, [2] byte start
    // offsets — the same row_splits partitions both value tensors.
    var flat_result = tf.Context.ExecuteOp("UnicodeDecodeWithOffsets", name, new ExecuteOpArgs(input)
    {
        // Attrs recorded for gradient replay of the op.
        GetGradientAttrs = op => new
        {
            input_encoding = op.get_attr<string>("input_encoding"),
            errors = op.get_attr<string>("errors"),
            replacement_char = op.get_attr<int>("replacement_char"),
            replace_control_characters = op.get_attr<bool>("replace_control_characters"),
            Tsplits = op.get_attr<TF_DataType>("Tsplits")
        }
    }.SetAttributes(new
    {
        input_encoding,
        errors,
        replacement_char,
        replace_control_characters
    }));

    var codepoints = RaggedTensor.from_row_splits(flat_result[1], flat_result[0], validate: false);
    var offsets = RaggedTensor.from_row_splits(flat_result[2], flat_result[0], validate: false);
    return (codepoints, offsets);
}
} | |||
} |
@@ -20,6 +20,7 @@ using System.Text; | |||
using System.Linq; | |||
using Tensorflow.Framework; | |||
using static Tensorflow.Binding; | |||
using NumSharp; | |||
namespace Tensorflow | |||
{ | |||
@@ -30,6 +31,8 @@ namespace Tensorflow | |||
{ | |||
Tensor _values; | |||
RowPartition _row_partition; | |||
Tensor _row_splits => _row_partition.row_splits; | |||
public TF_DataType dtype => _values.dtype; | |||
public TensorShape shape | |||
{ | |||
@@ -41,6 +44,28 @@ namespace Tensorflow | |||
} | |||
} | |||
/// <summary>
/// Slice indexer. Currently only partially implemented: the leading (row) slice
/// is extracted but never applied, and inner-dimension handling is a stub.
/// </summary>
public RaggedTensor this[params Slice[] slices]
{
    get
    {
        // First slice selects rows; the remainder index the inner dimensions.
        var row_key = slices[0];   // NOTE(review): computed but never applied.
        var inner_keys = slices.Skip(1).ToArray();
        var args = tensor_util.ParseSlices(slices);
        return tf_with(ops.name_scope(null, "RaggedGetItem", args), scope =>
        {
            string name = scope;   // NOTE(review): unused.
            return _ragged_getitem_inner_dimensions(this, inner_keys);
        });
    }
}
// Stub: should apply `slices` to the inner dimensions of `input`; currently a
// no-op that returns the input unchanged.
RaggedTensor _ragged_getitem_inner_dimensions(RaggedTensor input, Slice[] slices)
{
    return input;
}
public RaggedTensor(Tensor values, | |||
bool @internal = true, | |||
RowPartition row_partition = null) | |||
@@ -75,13 +100,44 @@ namespace Tensorflow | |||
}); | |||
} | |||
/// <summary>
/// Builds a <see cref="RaggedTensor"/> from a flat values tensor and a
/// row-splits tensor, via an intermediate <see cref="RowPartition"/>.
/// </summary>
/// <param name="values">Flat values tensor.</param>
/// <param name="row_splits">Row-splits tensor describing the row boundaries.</param>
/// <param name="name">Optional name scope.</param>
/// <param name="validate">Forwarded to partition construction.</param>
public static RaggedTensor from_row_splits(Tensor values, Tensor row_splits,
    string name = null, bool validate = true)
    => tf_with(ops.name_scope(name, "RaggedFromRowSplits"), scope =>
    {
        var partition = RowPartition.from_row_splits(row_splits, validate: validate);
        return from_row_partition(values, partition, validate: validate);
    });
/// <summary>
/// Encodes this ragged tensor into a single variant tensor by executing the
/// "RaggedTensorToVariant" op on the nested row splits plus flat values.
/// </summary>
/// <param name="batched_input">Forwarded as the op's "batched_input" attr — TODO confirm semantics (per-row variants vs. one variant).</param>
/// <param name="name">Optional name for the operation.</param>
Tensor _to_variant(bool batched_input = false, string name = null)
    => tf_with(ops.name_scope(name, "RaggedToVariant"), scope =>
    {
        return tf.Context.ExecuteOp("RaggedTensorToVariant", name,
            new ExecuteOpArgs(nested_row_splits, flat_values)
            {
                // Attrs recorded for gradient replay of the op.
                GetGradientAttrs = op => new
                {
                    RAGGED_RANK = op.get_attr<int>("RAGGED_RANK"),
                    Tvalues = op.get_attr<TF_DataType>("Tvalues"),
                    Tsplits = op.get_attr<TF_DataType>("Tsplits"),
                    batched_input = op.get_attr<bool>("batched_input")
                }
            }.SetAttributes(new { batched_input }));
    });

// The flat (inner) values tensor backing this ragged tensor.
Tensor flat_values
    => _values;

// Row-splits tensors; with a single row partition there is exactly one.
Tensor[] nested_row_splits
    => new[] { _row_splits };
/// <summary>
/// Human-readable summary: the shape plus a preview of up to ten string values.
/// </summary>
public override string ToString()
{
    var preview = string.Join(", ", _values.StringData().Take(10));
    return $"tf.RaggedTensor: shape={shape} [{preview}]";
}
/// <summary>
/// Implicit conversion to a dense <see cref="Tensor"/> by encoding this ragged
/// tensor as a variant tensor (see <see cref="_to_variant"/>).
/// NOTE(review): the scraped diff showed two conflicting bodies here — an older
/// one returning the raw <c>_values</c> tensor and this variant-encoding form;
/// the variant form matches the newly added RaggedTensorToVariant machinery.
/// </summary>
public static implicit operator Tensor(RaggedTensor indexedSlices)
    => indexedSlices._to_variant();
public static implicit operator RaggedTensor(Tensor tensor) | |||
{ | |||
@@ -28,6 +28,7 @@ namespace Tensorflow | |||
public class RowPartition : CompositeTensor | |||
{ | |||
// Row-splits tensor; exposed read-only via `row_splits`.
Tensor _row_splits;
public Tensor row_splits => _row_splits;
// NOTE(review): the fields below are declared but not populated by the visible
// from_row_splits factory — presumably derived lazily elsewhere; confirm.
Tensor _row_lengths;
Tensor _value_rowids;
Tensor _nrows;
@@ -89,5 +90,14 @@ namespace Tensorflow | |||
nrows: nrows); | |||
}); | |||
} | |||
/// <summary>
/// Creates a <see cref="RowPartition"/> from a row-splits tensor.
/// NOTE(review): <paramref name="validate"/> and <paramref name="preferred_dtype"/>
/// are accepted but currently ignored — no validation op is emitted and no dtype
/// conversion happens; confirm whether that is intentional.
/// </summary>
public static RowPartition from_row_splits(Tensor row_splits,
    bool validate = true, TF_DataType preferred_dtype = TF_DataType.DtInvalid)
{
    return tf_with(ops.name_scope(null, "RowPartitionFromRowSplits"), scope =>
    {
        return new RowPartition(row_splits);
    });
}
} | |||
} |
@@ -55,10 +55,9 @@ namespace Tensorflow.Keras.Layers | |||
if (inputs.shape.ndim > 1) | |||
input_tensor = array_ops.squeeze(inputs, axis: new[] { -1 }); | |||
if (args.Split == "whitespace") | |||
input_tensor = tf.strings.split(inputs); | |||
input_tensor = tf.strings.split(input_tensor); | |||
} | |||
return inputs; | |||
return input_tensor; | |||
} | |||
} | |||
} |
@@ -1,6 +1,8 @@ | |||
using System; | |||
using NumSharp; | |||
using System; | |||
using System.Collections.Generic; | |||
using System.Text; | |||
using static Tensorflow.Binding; | |||
namespace Tensorflow.Text.Tokenizers | |||
{ | |||
@@ -13,7 +15,31 @@ namespace Tensorflow.Text.Tokenizers | |||
/// <returns></returns> | |||
/// <summary>
/// Tokenizes a string tensor on whitespace. Work in progress: it invokes
/// <see cref="tokenize_with_offsets"/> (which itself throws) and is not yet
/// functional.
/// </summary>
/// <param name="input">String tensor to tokenize.</param>
/// <returns>Never returns — always throws <see cref="NotImplementedException"/>.</returns>
public Tensor tokenize(Tensor input)
{
    tokenize_with_offsets(input);
    throw new NotImplementedException("");
}

// WIP: builds the decode-with-offsets pipeline under a "WhitespaceTokenize"
// name scope, then throws until the tokenization logic is completed.
Tensor[] tokenize_with_offsets(Tensor input)
{
    tf_with(ops.name_scope(null, "WhitespaceTokenize"), scope =>
    {
        _whitespace_tokenize_with_offsets_encode_decode_wrapper(input);
    });
    throw new NotImplementedException("");
}
/// <summary>
/// WIP helper: decodes the strings to Unicode code points with byte start
/// offsets and derives each code point's byte end offset. The computed tensors
/// are not yet consumed — the method currently returns its input unchanged.
/// </summary>
Tensor _whitespace_tokenize_with_offsets_encode_decode_wrapper(Tensor input_tensor)
{
    // Decode the strings and get byte offsets
    var (codepoints, byte_start_offsets) = tf.strings.unicode_decode_with_offsets(input_tensor, "UTF-8");

    // End offset of code point i = start offset of code point i+1; the final
    // end offset is the string's total byte length cast to int64.
    // NOTE(review): assumes `new Slice(1)` drops the first column — confirm.
    var byte_end_offsets = array_ops.concat(new Tensor[]
    {
        byte_start_offsets[Slice.All, new Slice(1)],
        math_ops.cast(
            array_ops.expand_dims(tf.strings.string_length(input_tensor), 1),
            dtypes.int64)
    }, 1);
    return input_tensor;
}
} | |||
} |
@@ -10,10 +10,12 @@ namespace TensorFlowNET.UnitTest.Text | |||
[TestClass]
public class TokenizerTest
{
    /// <summary>
    /// Smoke test for WhitespaceTokenizer.tokenize. Marked Ignore because
    /// tokenize is still a NotImplementedException stub.
    /// The scraped diff carried both a bare [TestMethod] line and
    /// [TestMethod, Ignore]; only the Ignore form is kept — duplicate
    /// TestMethod attributes are invalid.
    /// </summary>
    [TestMethod, Ignore]
    public void Tokenize()
    {
        var docs = tf.constant(new[] { "Everything not saved will be lost." });
        var tokenizer = text.WhitespaceTokenizer();
        var tokens = tokenizer.tokenize(docs);
    }
}
} |