Merge pull request !2433 from qianlong21st/add_text_comment
@@ -11,9 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
mindspore.dataset.text
This module supports text processing for NLP. It includes two parts:
transforms and utils. transforms is a high-performance
NLP text processing module developed with icu4c and cppjieba.
utils provides some general methods for NLP text processing.
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \
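As a rough illustration of how the two parts fit together, the sketch below tokenizes lines from a text file with a transform and then decodes the resulting arrays with a utility function; the file path is a placeholder and the exact output depends on the data.

    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>> # "/path/to/corpus.txt" is a placeholder for a line-based text file
    >>> data = ds.TextFileDataset("/path/to/corpus.txt", shuffle=False)
    >>> # transforms part: split each line into unicode characters
    >>> data = data.map(operations=text.UnicodeCharTokenizer())
    >>> # utils part: decode the numpy arrays of bytes back into Python strings
    >>> for row in data.create_dict_iterator():
    >>>     print(text.to_str(row['text']))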
@@ -12,9 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
c transforms for all text related operators
The module text.transforms is inherited from _c_dataengine
which is implemented based on icu4c and cppjieba in C++.
It is a high-performance module for processing NLP text.
Users can use Vocab to build their own dictionary,
use appropriate tokenizers to split sentences into different tokens,
and use Lookup to find the index of tokens in the Vocab.
.. Note::
    The constructor arguments of every class in this module must be saved into the
    class attributes (self.xxx) to support save() and load().
Examples:
    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>> dataset_file = "path/to/text_file_path"
    >>> # sentences saved as line data in a file
    >>> dataset = ds.TextFileDataset(dataset_file, shuffle=False)
    >>> # tokenize each sentence into unicode characters
    >>> tokenizer = text.UnicodeCharTokenizer()
    >>> # load a vocabulary from a list
    >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'])
    >>> # Lookup is an operation that maps tokens to ids
    >>> lookup = text.Lookup(vocab)
    >>> dataset = dataset.map(operations=[tokenizer, lookup])
    >>> for i in dataset.create_dict_iterator():
    >>>     print(i)
    >>> # if the text line in dataset_file is:
    >>> #     深圳欢迎您
    >>> # then the output will be:
    >>> #     {'text': array([0, 1, 2, 3, 4], dtype=int32)}
"""
import os
import re
import platform
@@ -203,8 +231,8 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
    Args:
        vocab (Vocab): a Vocab object.
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word(default '##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default 100).
        suffix_indicator (str, optional): Used to show that the subword is the last part of a word (default='##').
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
        unknown_token (str, optional): When the token cannot be found: if 'unknown_token' is an empty string,
            return the token directly; otherwise return 'unknown_token' (default='[UNK]').
    """
@@ -299,7 +327,7 @@ if platform.system().lower() != 'windows':
                The original string will be split by matched elements.
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. The default value is an empty string (''),
                in this situation, delimiters will not kept as a output token.
                in which case delimiters will not be kept as output tokens (default='').
        """
        def __init__(self, delim_pattern, keep_delim_pattern=''):
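Assuming this constructor belongs to the RegexTokenizer operator (only registered on non-Windows platforms, per the platform check above), a small sketch of the two arguments; the regex patterns are just examples.

    >>> import mindspore.dataset.text as text
    >>> # split on runs of whitespace and drop the delimiters
    >>> tokenizer = text.RegexTokenizer(delim_pattern=r'\s+', keep_delim_pattern='')
    >>> # split on whitespace but keep each matched delimiter as a token of its own
    >>> tokenizer_keep = text.RegexTokenizer(delim_pattern=r'\s+', keep_delim_pattern=r'\s+')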
@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Some basic function for text
The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
and use to_bytes and to_str to encode and decode strings into a specified format.
"""
from enum import IntEnum
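A small sketch of the encode/decode helpers mentioned above; the array contents are made up for illustration.

    >>> import numpy as np
    >>> import mindspore.dataset.text as text
    >>> # to_bytes encodes a numpy array of strings into bytes with the given encoding
    >>> encoded = text.to_bytes(np.array(['深圳', '欢迎', '您']), encoding='utf8')
    >>> # to_str decodes it back into a numpy array of strings
    >>> decoded = text.to_str(encoded, encoding='utf8')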
@@ -52,12 +54,12 @@ class Vocab(cde.Vocab):
                min_frequency/max_frequency can be None, which corresponds to 0/total_words respectively
                (default=None, all words are included).
            top_k (int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
                taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None,
                taken. top_k is taken after freq_range. If fewer than top_k words are available, all words will be taken (default=None,
                all words are included).
            special_tokens (list, optional): a list of strings, each one is a special token, for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
                is specified and special_first is set to None, special_tokens will be prepended (default=None).
        Returns:
            Vocab, Vocab object built from dataset.
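A hedged sketch of building a vocabulary from a dataset with these arguments; the file path is a placeholder, and the column name 'text' matches the output column of TextFileDataset shown in the example above.

    >>> import mindspore.dataset as ds
    >>> import mindspore.dataset.text as text
    >>> data = ds.TextFileDataset("/path/to/corpus.txt", shuffle=False)
    >>> # tokenize each line first so the vocabulary is built over tokens rather than whole lines
    >>> data = data.map(operations=text.UnicodeCharTokenizer())
    >>> # keep tokens that occur at least twice, take the 5000 most frequent,
    >>> # and prepend the special tokens to the front of the vocabulary
    >>> vocab = text.Vocab.from_dataset(data, columns=["text"], freq_range=(2, None), top_k=5000, special_tokens=["<pad>", "<unk>"], special_first=True)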
@@ -81,7 +83,7 @@ class Vocab(cde.Vocab):
            special_tokens (list, optional): a list of strings, each one is a special token, for example
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
                is specified and special_first is set to None, special_tokens will be prepended (default=None).
        """
        return super().from_list(word_list, special_tokens, special_first)
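For instance, a small hand-written vocabulary with the special tokens placed in front (the words reuse the example from the transforms docstring above):

    >>> import mindspore.dataset.text as text
    >>> # with special_first=True, "<pad>" and "<unk>" take ids 0 and 1,
    >>> # and the regular words follow with ids 2, 3, 4, ...
    >>> vocab = text.Vocab.from_list(['深', '圳', '欢', '迎', '您'], special_tokens=["<pad>", "<unk>"], special_first=True)
    >>> lookup = text.Lookup(vocab)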
@@ -101,7 +103,7 @@ class Vocab(cde.Vocab):
                special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first (bool, optional): whether special_tokens will be prepended/appended to vocab.
                If special_tokens is specified and special_first is set to None,
                special_tokens will be prepended. (default=None).
                special_tokens will be prepended (default=None).
        """
        return super().from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
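A hedged sketch of loading a vocabulary from a file; the path is a placeholder, and passing an empty delimiter assumes each line of the file holds exactly one word.

    >>> import mindspore.dataset.text as text
    >>> # "/path/to/vocab.txt" is a placeholder; assume one word per line
    >>> vocab = text.Vocab.from_file("/path/to/vocab.txt", delimiter="", vocab_size=10000, special_tokens=["<unk>"], special_first=True)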
@@ -157,12 +159,14 @@ def to_bytes(array, encoding='utf8'):
class JiebaMode(IntEnum):
    """An enumeration for JiebaTokenizer, effective enumeration types are MIX, MP, HMM."""
    MIX = 0
    MP = 1
    HMM = 2
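These values select the segmentation algorithm used by JiebaTokenizer; a rough sketch follows, in which the two dictionary paths are placeholders for the cppjieba HMM and MP dictionary files.

    >>> import mindspore.dataset.text as text
    >>> from mindspore.dataset.text import JiebaMode
    >>> # hmm_path and mp_path point to the cppjieba dictionary files (placeholders here)
    >>> tokenizer = text.JiebaTokenizer("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8", mode=JiebaMode.MIX)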
class NormalizeForm(IntEnum):
    """An enumeration for NormalizeUTF8, effective enumeration types are NONE, NFC, NFKC, NFD, NFKD."""
    NONE = 0
    NFC = 1
    NFKC = 2