Merge pull request !2433 from qianlong21st/add_text_comment
@@ -11,9 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+mindspore.dataset.text
+This module supports text processing for NLP. It includes two parts:
+transforms and utils. transforms is a high-performance NLP text
+processing module developed with ICU4C and cppjieba.
+utils provides some general methods for NLP text processing.
 """
 import platform
 from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
@@ -12,9 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-c transforms for all text related operators
+The module text.transforms is inherited from _c_dataengine,
+which is implemented based on ICU4C and cppjieba in C++.
+It is a high-performance module for processing NLP text.
+Users can use Vocab to build their own dictionary,
+use appropriate tokenizers to split sentences into different tokens,
+and use Lookup to find the index of tokens in a Vocab.
+.. note::
+    Constructor arguments for every class in this module must be saved into the
+    class attributes (self.xxx) to support save() and load().
+Examples:
+    >>> import mindspore.dataset as ds
+    >>> import mindspore.dataset.text as text
+    >>> dataset_file = "path/to/text_file_path"
+    >>> # sentences saved as line data in a file
+    >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
+    >>> # tokenize each sentence into unicode characters
+    >>> tokenizer = text.UnicodeCharTokenizer()
+    >>> # load vocabulary from a list
+    >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
+    >>> # Lookup is an operation for mapping tokens to ids
+    >>> lookup = text.Lookup(vocab)
+    >>> dataset = dataset.map(operations=[tokenizer, lookup])
+    >>> for i in dataset.create_dict_iterator():
+    ...     print(i)
+    >>> # if a text line in dataset_file is:
+    >>> #     深圳欢迎您
+    >>> # then the output will be:
+    >>> #     {'text': array([0, 1, 2, 3, 4], dtype=int32)}
| """ | """ | ||||
| import os | import os | ||||
| import re | import re | ||||
| import platform | import platform | ||||
@@ -203,8 +231,8 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     Args:
         vocab (Vocab): a Vocab object.
-        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##').
-        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100).
+        suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
+        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
         unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
             return the token directly, otherwise return 'unknown_token' (default='[UNK]').
     """
@@ -299,7 +327,7 @@ if platform.system().lower() != 'windows':
                The original string will be split by matched elements.
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('');
-                in this situation, delimiters will not kept as a output token.
+                in this situation, delimiters will not be kept as output tokens (default='').
        """
        def __init__(self, delim_pattern, keep_delim_pattern=''):
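For reference, a minimal sketch of RegexTokenizer with and without keep_delim_pattern, assuming a non-Windows platform (the operator is only defined when platform.system() is not 'windows'); the regex patterns are illustrative:

    import mindspore.dataset.text as text

    # split on runs of whitespace; because keep_delim_pattern matches the
    # same pattern, the whitespace delimiters are kept as output tokens
    keep_delims = text.RegexTokenizer(delim_pattern="\\s+", keep_delim_pattern="\\s+")

    # with keep_delim_pattern left as the default '', delimiters are dropped
    drop_delims = text.RegexTokenizer(delim_pattern="\\s+")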
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Some basic function for text
+The module text.utils provides some general methods for NLP text processing.
+For example, you can use Vocab to build a dictionary, and use to_bytes and
+to_str to encode and decode strings into a specified format.
 """
 from enum import IntEnum
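For reference, a minimal sketch of the to_str / to_bytes round trip mentioned above; the sample byte array is an illustrative assumption:

    import numpy as np
    import mindspore.dataset.text as text

    # to_str decodes a NumPy array of bytes into strings and
    # to_bytes encodes it back; 'utf8' is the default encoding for both
    raw = np.array([b"welcome", b"to", b"shenzhen"])
    decoded = text.to_str(raw, encoding='utf8')
    encoded = text.to_bytes(decoded, encoding='utf8')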
@@ -52,12 +54,12 @@ class Vocab(cde.Vocab):
                min_frequency/max_frequency can be None, which corresponds to 0/total_words respectively
                (default=None, all words are included).
            top_k (int, optional): top_k > 0. Number of words to be built into the vocab. The top_k most frequent words are
-                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None,
+                taken. top_k is taken after freq_range. If there are fewer than top_k words, all words will be taken (default=None,
                all words are included).
            special_tokens (list, optional): a list of strings, each one a special token, for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to the vocab. If special_tokens
-                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
+                is specified and special_first is set to None, special_tokens will be prepended (default=None).

        Returns:
            Vocab, Vocab object built from the dataset.
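For reference, a minimal sketch of Vocab.from_dataset using only the parameters documented above; the file path is an assumption, and the tokenization pipeline mirrors the module example:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # tokenize a line-per-sentence text file, then build a vocab from the
    # 100 most frequent tokens, with special tokens prepended
    dataset = ds.TextFileDataset("path/to/text_file", shuffle=False)
    dataset = dataset.map(operations=[text.UnicodeCharTokenizer()])
    vocab = text.Vocab.from_dataset(dataset,
                                    freq_range=None,
                                    top_k=100,
                                    special_tokens=["<pad>", "<unk>"],
                                    special_first=True)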
@@ -81,7 +83,7 @@ class Vocab(cde.Vocab):
            special_tokens (list, optional): a list of strings, each one a special token, for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to the vocab. If special_tokens
-                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
+                is specified and special_first is set to None, special_tokens will be prepended (default=None).
        """
        return super().from_list(word_list, special_tokens, special_first)
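For reference, a minimal sketch of Vocab.from_list with special tokens prepended and then used with Lookup; the word list is an illustrative assumption:

    import mindspore.dataset.text as text

    # special tokens are prepended because special_first is True,
    # so "<pad>" gets id 0 and "<unk>" gets id 1
    vocab = text.Vocab.from_list(["home", "behind", "the", "world"],
                                 special_tokens=["<pad>", "<unk>"],
                                 special_first=True)
    lookup = text.Lookup(vocab)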
@@ -101,7 +103,7 @@ class Vocab(cde.Vocab):
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to the vocab.
                If special_tokens is specified and special_first is set to None,
-                special_tokens will be prepended. (default=None).
+                special_tokens will be prepended (default=None).
        """
        return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
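For reference, a minimal sketch of Vocab.from_file; the vocab file path is an assumption and the file is expected to hold one word per line:

    import mindspore.dataset.text as text

    # delimiter='' takes each whole line as a word;
    # vocab_size=None keeps every word in the file
    vocab = text.Vocab.from_file("path/to/vocab.txt",
                                 delimiter="",
                                 vocab_size=None,
                                 special_tokens=["<unk>"],
                                 special_first=True)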
@@ -157,12 +159,14 @@ def to_bytes(array, encoding='utf8'):
 class JiebaMode(IntEnum):
+    """An enumeration for JiebaTokenizer; effective enumeration types are MIX, MP, HMM."""
     MIX = 0
     MP = 1
     HMM = 2


 class NormalizeForm(IntEnum):
+    """An enumeration for NormalizeUTF8; effective enumeration types are NONE, NFC, NFKC, NFD, NFKD."""
     NONE = 0
     NFC = 1
     NFKC = 2
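For reference, a minimal sketch of how these enumerations are typically passed to their operators, assuming both enums are re-exported at the package level; the Jieba dictionary paths are assumptions, and NormalizeUTF8 is only available on non-Windows platforms:

    import mindspore.dataset.text as text

    # JiebaMode selects the segmentation algorithm used by JiebaTokenizer
    jieba_op = text.JiebaTokenizer(hmm_path="path/to/hmm_model.utf8",
                                   mp_path="path/to/jieba.dict.utf8",
                                   mode=text.JiebaMode.MP)

    # NormalizeForm selects the Unicode normalization applied by NormalizeUTF8
    normalize_op = text.NormalizeUTF8(normalize_form=text.NormalizeForm.NFKC)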