
Cleaned up defaulting the string analyzer in Tokenizer.

pull/756/head
Niklas Gustafsson 4 years ago
commit 13be215f73
2 changed files with 6 additions and 9 deletions:
  1. src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs (+5 -1)
  2. src/TensorFlowNET.Keras/TextApi.cs (+1 -8)

src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs (+5 -1)

@@ -16,6 +16,10 @@ namespace Tensorflow.Keras.Text
     /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
     /// each token could be binary, based on word count, based on tf-idf...
     /// </summary>
+    /// <remarks>
+    /// This code is a fairly straight port of the Python code for Keras text preprocessing found at:
+    /// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
+    /// </remarks>
     public class Tokenizer
     {
         private readonly int num_words;
@@ -51,7 +55,7 @@ namespace Tensorflow.Keras.Text
             this.split = split;
             this.char_level = char_level;
             this.oov_token = oov_token;
-            this.analyzer = analyzer;
+            this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split);
         }
 
         /// <summary>

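The net effect of the Tokenizer.cs change is that the constructor now owns the fallback: when no analyzer is supplied, it wires up TextApi.text_to_word_sequence with the tokenizer's own filters, lower and split settings. A minimal sketch of that pattern, with the field list trimmed and the parameter names assumed from the context lines above:

using System;
using System.Collections.Generic;
using Tensorflow.Keras;   // for TextApi.text_to_word_sequence

public class Tokenizer
{
    private readonly string filters;
    private readonly bool lower;
    private readonly char split;
    private readonly Func<string, IEnumerable<string>> analyzer;

    // Only the analyzer-related parameters are shown; the real constructor
    // also takes num_words, char_level and oov_token (see the hunk above).
    public Tokenizer(string filters, bool lower, char split,
                     Func<string, IEnumerable<string>> analyzer = null)
    {
        this.filters = filters;
        this.lower = lower;
        this.split = split;
        // Fall back to the shared word-sequence splitter when the caller
        // passes no analyzer, so every construction path gets the same default.
        this.analyzer = analyzer != null
            ? analyzer
            : (text) => TextApi.text_to_word_sequence(text, filters, lower, split);
    }
}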

src/TensorFlowNET.Keras/TextApi.cs (+1 -8)

@@ -17,14 +17,7 @@ namespace Tensorflow.Keras
             string oov_token = null,
             Func<string, IEnumerable<string>> analyzer = null)
         {
-            if (analyzer != null)
-            {
-                return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer);
-            }
-            else
-            {
-                return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, (text) => text_to_word_sequence(text, filters, lower, split));
-            }
+            return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer);
         }
 
         public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ')

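From the caller's side nothing changes: the TextApi factory now forwards whatever analyzer it receives, null included, and lets the constructor supply the default. A hypothetical usage sketch of the two paths, based on the constructor signature visible in the factory call above; the named arguments for num_words, filters and lower, the filter string, and the "<unk>" token are illustrative assumptions, not taken from this diff:

using System;
using System.Collections.Generic;
using Tensorflow.Keras.Text;

// Analyzer that splits on commas instead of whitespace (illustrative only).
Func<string, IEnumerable<string>> commaSplit = text => text.Split(',');

// Explicit analyzer: forwarded to the constructor unchanged.
var custom = new Tokenizer(
    num_words: 1000,
    filters: "!\"#$%.,;:\t\n",   // illustrative filter set, not the library default
    lower: true,
    split: ' ',
    char_level: false,
    oov_token: "<unk>",
    analyzer: commaSplit);

// No analyzer: the constructor falls back to text_to_word_sequence,
// which is exactly what the old if/else in the TextApi factory did inline.
var defaulted = new Tokenizer(
    num_words: 1000,
    filters: "!\"#$%.,;:\t\n",
    lower: true,
    split: ' ',
    char_level: false,
    oov_token: "<unk>",
    analyzer: null);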
