@@ -16,6 +16,10 @@ namespace Tensorflow.Keras.Text | |||||
/// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for | /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for | ||||
/// each token could be binary, based on word count, based on tf-idf... | /// each token could be binary, based on word count, based on tf-idf... | ||||
/// </summary> | /// </summary> | ||||
/// <remarks> | |||||
/// This code is a fairly straight port of the Python code for Keras text preprocessing found at: | |||||
/// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py | |||||
/// </remarks> | |||||
public class Tokenizer | public class Tokenizer | ||||
{ | { | ||||
private readonly int num_words; | private readonly int num_words; | ||||
@@ -51,7 +55,7 @@ namespace Tensorflow.Keras.Text | |||||
this.split = split; | this.split = split; | ||||
this.char_level = char_level; | this.char_level = char_level; | ||||
this.oov_token = oov_token; | this.oov_token = oov_token; | ||||
this.analyzer = analyzer; | |||||
this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split); | |||||
} | } | ||||
/// <summary> | /// <summary> | ||||
@@ -17,14 +17,7 @@ namespace Tensorflow.Keras | |||||
string oov_token = null, | string oov_token = null, | ||||
Func<string, IEnumerable<string>> analyzer = null) | Func<string, IEnumerable<string>> analyzer = null) | ||||
{ | { | ||||
if (analyzer != null) | |||||
{ | |||||
return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer); | |||||
} | |||||
else | |||||
{ | |||||
return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, (text) => text_to_word_sequence(text, filters, lower, split)); | |||||
} | |||||
return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer); | |||||
} | } | ||||
public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ') | public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ') | ||||