Signed-off-by: alex-yuyue <yue.yu1@huawei.com>
| @@ -134,6 +134,15 @@ inline std::vector<std::pair<std::string, std::vector<int32_t>>> ClassIndexCharT | |||
| return ret; | |||
| } | |||
| inline std::vector<std::pair<std::vector<char>, int64_t>> PairStringInt64ToPairCharInt64( | |||
| const std::vector<std::pair<std::string, int64_t>> &s) { | |||
| std::vector<std::pair<std::vector<char>, int64_t>> ret; | |||
| std::transform(s.begin(), s.end(), std::back_inserter(ret), [](auto str) { | |||
| return std::pair<std::vector<char>, int64_t>(std::vector<char>(str.first.begin(), str.first.end()), str.second); | |||
| }); | |||
| return ret; | |||
| } | |||
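These inline helpers keep std::string off the public API boundary: user-facing overloads take std::string and are bridged to std::vector<char> internally. Below is a minimal, self-contained sketch of that round trip; the StringToChar/CharToString definitions are simplified stand-ins for the project's own helpers, included only so the snippet compiles on its own.

```cpp
// Sketch only: simplified stand-ins for the string <-> vector<char> helpers,
// showing how a user dictionary is converted at the API boundary and back.
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

std::vector<char> StringToChar(const std::string &s) { return std::vector<char>(s.begin(), s.end()); }
std::string CharToString(const std::vector<char> &c) { return std::string(c.begin(), c.end()); }

std::vector<std::pair<std::vector<char>, int64_t>> PairStringInt64ToPairCharInt64(
    const std::vector<std::pair<std::string, int64_t>> &s) {
  std::vector<std::pair<std::vector<char>, int64_t>> ret;
  for (const auto &p : s) {
    ret.emplace_back(StringToChar(p.first), p.second);
  }
  return ret;
}

int main() {
  std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}, {"男默女泪", 0}};
  auto char_dict = PairStringInt64ToPairCharInt64(user_dict);
  for (const auto &p : char_dict) {
    // Round-trips back to the original std::string content.
    std::cout << CharToString(p.first) << " " << p.second << std::endl;
  }
  return 0;
}
```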
| template <class T> | |||
| inline std::map<std::vector<char>, T> PadInfoStringToChar(const std::map<std::string, T> &s_pad_info) { | |||
| std::map<std::vector<char>, T> ret; | |||
| @@ -232,12 +232,17 @@ PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) { | |||
| })); | |||
| })); | |||
| // TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ | |||
| PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { | |||
| (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>( | |||
| *m, "WordpieceTokenizerOp") | |||
| .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, | |||
| const bool &>()); | |||
| PYBIND_REGISTER(WordpieceTokenizerOperation, 1, ([](const py::module *m) { | |||
| (void)py::class_<text::WordpieceTokenizerOperation, TensorOperation, | |||
| std::shared_ptr<text::WordpieceTokenizerOperation>>(*m, | |||
| "WordpieceTokenizerOperation") | |||
| .def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, bool with_offsets) { | |||
| auto wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizerOperation>( | |||
| vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets); | |||
| THROW_IF_ERROR(wordpiece_tokenizer->ValidateParams()); | |||
| return wordpiece_tokenizer; | |||
| })); | |||
| })); | |||
| PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { | |||
| @@ -15,6 +15,8 @@ | |||
| */ | |||
| #include <unistd.h> | |||
| #include <fstream> | |||
| #include <regex> | |||
| #include "minddata/dataset/include/text.h" | |||
| @@ -131,7 +133,7 @@ std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() { | |||
| return jieba_tokenizer; | |||
| } | |||
| Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { | |||
| Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) { | |||
| if (word.empty()) { | |||
| std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| @@ -142,7 +144,59 @@ Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| data_->words_list_.emplace_back(word, freq); | |||
| data_->words_list_.emplace_back(CharToString(word), freq); | |||
| return Status::OK(); | |||
| } | |||
| Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) { | |||
| for (auto &word_freq_pair : user_dict) { | |||
| RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second)); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) { | |||
| std::vector<std::pair<std::string, int64_t>> user_dict; | |||
| RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict)); | |||
| RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict))); | |||
| return Status::OK(); | |||
| } | |||
| Status JiebaTokenizer::ParserFile(const std::string &file_path, | |||
| std::vector<std::pair<std::string, int64_t>> *const user_dict) { | |||
| std::ifstream ifs(file_path); | |||
| if (!ifs) { | |||
| std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| std::string line; | |||
| while (std::getline(ifs, line)) { | |||
| if (line.empty()) { | |||
| continue; | |||
| } | |||
| std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$"); | |||
| std::smatch tokens; | |||
| if (std::regex_match(line, tokens, regex)) { | |||
| if (tokens.size() == 2) { | |||
| user_dict->emplace_back(tokens.str(1), 0); | |||
| } else if (tokens.size() == 3) { | |||
| user_dict->emplace_back(tokens.str(1), strtoll(tokens.str(2).c_str(), NULL, 0)); | |||
| } else { | |||
| continue; | |||
| } | |||
| } else { | |||
| continue; | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size(); | |||
| MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):"; | |||
| for (std::size_t i = 0; i != user_dict->size(); ++i) { | |||
| if (i >= 10) break; | |||
| MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second; | |||
| } | |||
| return Status::OK(); | |||
| } | |||
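ParserFile accepts one entry per line, either `word` or `word freq`, and silently skips anything else. Below is a standalone sketch of the same regex applied to a few sample lines; the sample lines are made up for demonstration.

```cpp
// Demonstrates the dictionary-line regex used above: valid rows yield a word
// plus an optional frequency (defaulting to 0); invalid rows are skipped.
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> lines = {"今天天气 10", "长江大桥", "   ", "not a valid dictionary row"};
  std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  for (const auto &line : lines) {
    std::smatch tokens;
    if (!std::regex_match(line, tokens, regex)) {
      continue;  // invalid input rows are ignored, matching ParserFile's behaviour
    }
    int64_t freq = tokens[2].matched ? strtoll(tokens.str(2).c_str(), nullptr, 0) : 0;
    std::cout << tokens.str(1) << " -> " << freq << std::endl;
  }
  return 0;
}
```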
| @@ -310,6 +364,32 @@ std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() { | |||
| return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_); | |||
| } | |||
| // WordpieceTokenizer | |||
| struct WordpieceTokenizer::Data { | |||
| Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token, | |||
| const std::vector<char> &unknown_token, bool with_offsets) | |||
| : vocab_(vocab), | |||
| suffix_indicator_(CharToString(suffix_indicator)), | |||
| max_bytes_per_token_(max_bytes_per_token), | |||
| unknown_token_(CharToString(unknown_token)), | |||
| with_offsets_(with_offsets) {} | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::string suffix_indicator_; | |||
| int32_t max_bytes_per_token_; | |||
| std::string unknown_token_; | |||
| bool with_offsets_; | |||
| }; | |||
| WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::vector<char> &unknown_token, | |||
| bool with_offsets) | |||
| : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {} | |||
| std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() { | |||
| return std::make_shared<WordpieceTokenizerOperation>( | |||
| data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_); | |||
| } | |||
| #ifndef _WIN32 | |||
| // UnicodeScriptTokenizer | |||
| struct UnicodeScriptTokenizer::Data { | |||
| @@ -52,7 +52,7 @@ class BasicTokenizer final : public TensorTransform { | |||
| /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). | |||
| /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', | |||
| /// '[MASK]' (default=true). | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false, | |||
| const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, | |||
| bool with_offsets = false); | |||
| @@ -88,7 +88,7 @@ class BertTokenizer final : public TensorTransform { | |||
| /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). | |||
| /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', | |||
| /// '[MASK]' (default=true). | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", | |||
| int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", | |||
| bool lower_case = false, bool keep_whitespace = false, | |||
| @@ -145,7 +145,7 @@ class JiebaTokenizer final : public TensorTransform { | |||
| /// - JiebaMode.kMP, tokenize with MPSegment algorithm. | |||
| /// - JiebaMode.kHMM, tokenize with Hidden Markov Model Segment algorithm. | |||
| /// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, | |||
| const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false) | |||
| : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {} | |||
| @@ -156,7 +156,24 @@ class JiebaTokenizer final : public TensorTransform { | |||
| /// \brief Destructor | |||
| ~JiebaTokenizer() = default; | |||
| Status AddWord(const std::string &word, int64_t freq = 0); | |||
| /// \brief Add user defined word to JiebaTokenizer's dictionary. | |||
| /// \param[in] word The word to be added to the JiebaTokenizer instance. | |||
| /// The added word will not be written into the built-in dictionary on disk. | |||
| /// \param[in] freq The frequency of the word to be added. The higher the frequency, | |||
| /// the better the chance that the word will be tokenized (default=0). | |||
| Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); } | |||
| /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary. | |||
| /// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary. | |||
| Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) { | |||
| return AddDictChar(PairStringInt64ToPairCharInt64(user_dict)); | |||
| } | |||
| /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file. | |||
| /// Only valid word-freq pairs in the user-provided file will be added into the dictionary. | |||
| /// Rows containing invalid input will be ignored; no error or warning Status is returned. | |||
| /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs. | |||
| Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); } | |||
| protected: | |||
| /// \brief Function to convert TensorTransform object into a TensorOperation object. | |||
| @@ -164,6 +181,20 @@ class JiebaTokenizer final : public TensorTransform { | |||
| std::shared_ptr<TensorOperation> Parse() override; | |||
| private: | |||
| /// \brief Parse user-defined words from a file. | |||
| /// \param[in] file_path Path to the user-defined file. | |||
| /// \param[out] user_dict Vector of word-freq pairs extracted from the user-provided file. | |||
| Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict); | |||
| /// \brief Used to translate all API strings to vector of char and back. | |||
| Status AddWordChar(const std::vector<char> &word, int64_t freq = 0); | |||
| /// \brief Used to translate all API strings to vector of char and back. | |||
| Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict); | |||
| /// \brief Used to translate all API strings to vector of char and back. | |||
| Status AddDictChar(const std::vector<char> &file_path); | |||
| struct Data; | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
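For reference, here is a condensed usage sketch of the new public overloads declared above. It is written in the style of the pipeline tests added later in this patch; the model and dictionary paths are placeholders, and the same using-directives as in those tests (for the mindspore::dataset namespaces) are assumed.

```cpp
// Sketch: configure a JiebaTokenizer with an extra word, an in-memory user
// dictionary, and a dictionary file. Paths are placeholders; Status returns
// are discarded here, whereas the tests check them with ASSERT_OK/EXPECT_ERROR.
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "minddata/dataset/include/text.h"

std::shared_ptr<text::JiebaTokenizer> MakeJiebaWithUserDict() {
  auto jieba = std::make_shared<text::JiebaTokenizer>("/path/to/hmm_model.utf8", "/path/to/jieba.dict.utf8",
                                                      JiebaMode::kMp);
  (void)jieba->AddWord("男默女泪");  // single word, default frequency 0
  std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
  (void)jieba->AddDict(user_dict);                 // word-freq pairs passed in memory
  (void)jieba->AddDict("/path/to/user_dict.txt");  // loaded from a file; invalid rows are skipped silently
  return jieba;
}
```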
| @@ -292,7 +323,7 @@ class RegexTokenizer final : public TensorTransform { | |||
| /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be | |||
| /// matched by 'keep_delim_pattern'. The default value is an empty string ("") | |||
| /// which means that delimiters will not be kept as an output token (default=""). | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false) | |||
| : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {} | |||
| @@ -416,7 +447,7 @@ class TruncateSequencePair final : public TensorTransform { | |||
| class UnicodeCharTokenizer final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit UnicodeCharTokenizer(bool with_offsets = false); | |||
| /// \brief Destructor | |||
| @@ -432,13 +463,45 @@ class UnicodeCharTokenizer final : public TensorTransform { | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
| /// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens. | |||
| class WordpieceTokenizer final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] vocab A Vocab object. | |||
| /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##'). | |||
| /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). | |||
| /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty | |||
| /// string, else return the string specified (default='[UNK]'). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", | |||
| int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", | |||
| bool with_offsets = false) | |||
| : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), | |||
| with_offsets) {} | |||
| explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets); | |||
| /// \brief Destructor | |||
| ~WordpieceTokenizer() = default; | |||
| protected: | |||
| /// \brief Function to convert TensorTransform object into a TensorOperation object. | |||
| /// \return Shared pointer to TensorOperation object. | |||
| std::shared_ptr<TensorOperation> Parse() override; | |||
| private: | |||
| struct Data; | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
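A compact sketch of constructing the new C++ WordpieceTokenizer, mirroring the pipeline tests added later in this patch; the vocabulary is illustrative and the tests' using-directives are assumed. The returned transform can then be attached to a dataset with ds->Map({tokenizer}, {"text"}), exactly as in those tests.

```cpp
// Sketch: build a small Vocab and wrap it in a WordpieceTokenizer transform.
#include <memory>
#include <string>
#include <vector>
#include "minddata/dataset/include/text.h"

std::shared_ptr<TensorTransform> MakeWordpieceTokenizer() {
  std::vector<std::string> vocab_list = {"my", "favor", "##ite", "book"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_list, {}, true, &vocab);
  if (!s.IsOk()) {
    return nullptr;
  }
  // suffix_indicator="##", max_bytes_per_token=100, unknown_token="[UNK]", with_offsets=false
  return std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
}
```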
| #ifndef _WIN32 | |||
| /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. | |||
| class UnicodeScriptTokenizer final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] keep_whitespace If or not emit whitespace tokens (default=false). | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] keep_whitespace Whether or not to emit whitespace tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false); | |||
| /// \brief Destructor | |||
| @@ -458,7 +521,7 @@ class UnicodeScriptTokenizer final : public TensorTransform { | |||
| class WhitespaceTokenizer final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||
| /// \param[in] with_offsets Whether or not to output offsets of tokens (default=false). | |||
| explicit WhitespaceTokenizer(bool with_offsets = false); | |||
| /// \brief Destructor | |||
| @@ -36,6 +36,7 @@ | |||
| #include "minddata/dataset/text/kernels/to_number_op.h" | |||
| #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | |||
| #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" | |||
| #ifndef _WIN32 | |||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | |||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | |||
| @@ -396,6 +397,39 @@ std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() { | |||
| return tensor_op; | |||
| } | |||
| // WordpieceTokenizerOperation | |||
| WordpieceTokenizerOperation::WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab, | |||
| const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||
| bool with_offsets) | |||
| : vocab_(vocab), | |||
| suffix_indicator_(suffix_indicator), | |||
| max_bytes_per_token_(max_bytes_per_token), | |||
| unknown_token_(unknown_token), | |||
| with_offsets_(with_offsets) {} | |||
| Status WordpieceTokenizerOperation::ValidateParams() { | |||
| if (vocab_ == nullptr) { | |||
| std::string err_msg = "WordpieceTokenizer: vocab object type is incorrect or null."; | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| if (max_bytes_per_token_ < 0) { | |||
| std::string err_msg = | |||
| "WordpieceTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + | |||
| std::to_string(max_bytes_per_token_); | |||
| MS_LOG(ERROR) << err_msg; | |||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| std::shared_ptr<TensorOp> WordpieceTokenizerOperation::Build() { | |||
| std::shared_ptr<WordpieceTokenizerOp> tensor_op = std::make_shared<WordpieceTokenizerOp>( | |||
| vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, with_offsets_); | |||
| return tensor_op; | |||
| } | |||
| #ifndef _WIN32 | |||
| // UnicodeScriptTokenizerOperation | |||
| UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) | |||
| @@ -49,6 +49,7 @@ constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; | |||
| constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; | |||
| constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; | |||
| constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; | |||
| constexpr char kWordpieceTokenizerOperation[] = "WordpieceTokenizer"; | |||
| /* ####################################### Derived TensorOperation classes ################################# */ | |||
| @@ -318,6 +319,28 @@ class UnicodeCharTokenizerOperation : public TensorOperation { | |||
| bool with_offsets_; | |||
| }; | |||
| class WordpieceTokenizerOperation : public TensorOperation { | |||
| public: | |||
| explicit WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||
| bool with_offsets); | |||
| ~WordpieceTokenizerOperation() = default; | |||
| std::shared_ptr<TensorOp> Build() override; | |||
| Status ValidateParams() override; | |||
| std::string Name() const override { return kWordpieceTokenizerOperation; } | |||
| private: | |||
| std::shared_ptr<Vocab> vocab_; | |||
| std::string suffix_indicator_; | |||
| int32_t max_bytes_per_token_; | |||
| std::string unknown_token_; | |||
| bool with_offsets_; | |||
| }; | |||
| #ifndef _WIN32 | |||
| class UnicodeScriptTokenizerOperation : public TensorOperation { | |||
| public: | |||
| @@ -2193,7 +2193,7 @@ def _pyfunc_worker_init(pyfunc_list): | |||
| # All exceptions will be raised to main processes | |||
| def _pyfunc_worker_exec(index, op_id, mapping, lock, record, *args): | |||
| """ | |||
| Internal function for call certain pyfunc in python process. | |||
| Internal function for calling a certain pyfunc in a Python process. | |||
| """ | |||
| # Some threads in multiprocess.pool can't process sigint signal, | |||
| # and will occur hang problem, so ctrl+c will pass to parent process. | |||
| @@ -2366,7 +2366,7 @@ class MapDataset(Dataset): | |||
| # Pass #1, look for Python callables and build list | |||
| for op in self.operations: | |||
| # our c transforms is now callable and should not be run in python multithreading | |||
| # our C transforms are now callable and should not be run in Python multithreading | |||
| if callable(op) and str(op).find("c_transform") < 0: | |||
| callable_list.append(op) | |||
| @@ -2383,7 +2383,7 @@ class MapDataset(Dataset): | |||
| _op_process = _manager.dict() | |||
| _process_lock = _manager.Lock() | |||
| for op in self.operations: | |||
| # our c transforms is now callable and should not be run in python multithreading | |||
| # our C transforms are now callable and should not be run in Python multithreading | |||
| if callable(op) and str(op).find("c_transform") < 0: | |||
| # Wrap Python callable into _PythonCallable | |||
| iter_specific_operations.append(_PythonCallable(op, idx, op_id, _op_process, _process_lock, | |||
| @@ -609,7 +609,7 @@ class SubsetSampler(BuiltinSampler): | |||
| Samples the elements from a sequence of indices. | |||
| Args: | |||
| indices (Any iterable python object but string): A sequence of indices. | |||
| indices (Any iterable Python object but string): A sequence of indices. | |||
| num_samples (int, optional): Number of elements to sample (default=None, all elements). | |||
| Examples: | |||
| @@ -101,7 +101,7 @@ class JiebaTokenizer(TextTensorOperation): | |||
| - JiebaMode.MP, tokenize with MPSegment algorithm. | |||
| - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm. | |||
| - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> from mindspore.dataset.text import JiebaMode | |||
| @@ -185,6 +185,9 @@ class JiebaTokenizer(TextTensorOperation): | |||
| word2 None | |||
| word3 freq3 | |||
| Only valid word-freq pairs in the user-provided file will be added into the dictionary. | |||
| Rows containing invalid input will be ignored. No error or warning Status is returned. | |||
| Examples: | |||
| >>> from mindspore.dataset.text import JiebaMode | |||
| >>> jieba_hmm_file = "/path/to/jieba/hmm/file" | |||
| @@ -220,16 +223,16 @@ class JiebaTokenizer(TextTensorOperation): | |||
| "user dict file {} is not exist.".format(file_path)) | |||
| real_file_path = os.path.realpath(file_path) | |||
| file_dict = open(real_file_path) | |||
| data_re = re.compile('^(.+?)( [0-9]+)?$', re.U) | |||
| data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U) | |||
| words_list = [] | |||
| for item in file_dict: | |||
| data = item.strip() | |||
| if not isinstance(data, str): | |||
| data = self.__decode(data) | |||
| words = data_re.match(data).groups() | |||
| if len(words) != 2: | |||
| raise ValueError( | |||
| "user dict file {} format error.".format(real_file_path)) | |||
| tmp = data_re.match(data) | |||
| if not tmp: | |||
| continue | |||
| words = tmp.groups() | |||
| words_list.append(words) | |||
| file_dict.close() | |||
| return words_list | |||
| @@ -447,7 +450,7 @@ class UnicodeCharTokenizer(TextTensorOperation): | |||
| Tokenize a scalar tensor of UTF-8 string to Unicode characters. | |||
| Args: | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| @@ -469,8 +472,7 @@ class UnicodeCharTokenizer(TextTensorOperation): | |||
| return cde.UnicodeCharTokenizerOperation(self.with_offsets) | |||
| # TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ | |||
| class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||
| class WordpieceTokenizer(TextTensorOperation): | |||
| """ | |||
| Tokenize scalar token or 1-D tokens to 1-D subword tokens. | |||
| @@ -480,7 +482,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||
| max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100). | |||
| unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string, | |||
| return the token directly, else return 'unknown_token' (default='[UNK]'). | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"] | |||
| @@ -506,8 +508,10 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||
| self.max_bytes_per_token = max_bytes_per_token | |||
| self.unknown_token = unknown_token | |||
| self.with_offsets = with_offsets | |||
| super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, | |||
| self.unknown_token, self.with_offsets) | |||
| def parse(self): | |||
| return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token, | |||
| self.unknown_token, self.with_offsets) | |||
| class PythonTokenizer: | |||
| @@ -561,7 +565,7 @@ if platform.system().lower() != 'windows': | |||
| only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). | |||
| preserve_unused_token (bool, optional): If True, do not split special tokens like | |||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> from mindspore.dataset.text import NormalizeForm | |||
| @@ -627,7 +631,7 @@ if platform.system().lower() != 'windows': | |||
| only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). | |||
| preserve_unused_token (bool, optional): If True, do not split special tokens like | |||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> from mindspore.dataset.text import NormalizeForm | |||
| @@ -782,7 +786,7 @@ if platform.system().lower() != 'windows': | |||
| keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token | |||
| if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('') | |||
| which means that delimiters will not be kept as an output token (default=''). | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| @@ -818,8 +822,8 @@ if platform.system().lower() != 'windows': | |||
| UnicodeScriptTokenizer is not supported on Windows platform yet. | |||
| Args: | |||
| keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False). | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| @@ -854,7 +858,7 @@ if platform.system().lower() != 'windows': | |||
| WhitespaceTokenizer is not supported on Windows platform yet. | |||
| Args: | |||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||
| with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False). | |||
| Examples: | |||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | |||
| @@ -988,7 +988,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) { | |||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||
| // Add word with freq not provided (default 0) | |||
| jieba_tokenizer->AddWord("男默女泪"); | |||
| ASSERT_OK(jieba_tokenizer->AddWord("男默女泪")); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||
| @@ -1038,7 +1038,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) { | |||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||
| // Add word with freq is set explicitly to 0 | |||
| jieba_tokenizer->AddWord("男默女泪", 0); | |||
| ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0)); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||
| @@ -1088,7 +1088,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) { | |||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||
| // Add word with freq 10 | |||
| jieba_tokenizer->AddWord("男默女泪", 10); | |||
| ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10)); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||
| @@ -1138,7 +1138,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) { | |||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||
| // Add word with freq 20000 | |||
| jieba_tokenizer->AddWord("江大桥", 20000); | |||
| ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000)); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||
| @@ -1194,6 +1194,115 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) { | |||
| EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) { | |||
| // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pairs. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; | |||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create jieba_tokenizer operation on ds | |||
| std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = | |||
| std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); | |||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||
| // Add a user dict containing the word "江大桥" with freq 20000 | |||
| std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}}; | |||
| ASSERT_OK(jieba_tokenizer->AddDict(user_dict)); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["text"]; | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 1); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) { | |||
| // Testing AddDict of JiebaTokenizer when the input is a path to dict. | |||
| // Test error scenario for AddDict: invalid path | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile."; | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; | |||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create jieba_tokenizer operation on ds | |||
| std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = | |||
| std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); | |||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||
| // Load dict from txt file | |||
| std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt"; | |||
| std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt"; | |||
| EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path)); | |||
| ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path)); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["text"]; | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 1); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { | |||
| // Testing the parameter of SlidingWindow interface when the axis is 0. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; | |||
| @@ -2523,6 +2632,421 @@ TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) { | |||
| iter->Stop(); | |||
| } | |||
| std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my", | |||
| "is", "love", "dur", "##ing", "the"}; | |||
| std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"}; | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1."; | |||
| // Test WordpieceTokenizer with default parameters on English vocab | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Take operation on ds | |||
| ds = ds->Take(10); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::vector<std::string>> expected = { | |||
| {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["text"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 10); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2."; | |||
| // Test WordpieceTokenizer with empty unknown_token | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Take operation on ds | |||
| ds = ds->Take(10); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::vector<std::string>> expected = { | |||
| {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["text"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 10); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3."; | |||
| // Test WordpieceTokenizer with non-default max_bytes_per_token | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Take operation on ds | |||
| ds = ds->Take(10); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"}, | |||
| {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["text"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 10); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4."; | |||
| // Test WordpieceTokenizer with default parameters on Chinese vocab | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Skip operation on ds | |||
| ds = ds->Skip(10); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Take operation on ds | |||
| ds = ds->Take(15); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"}, | |||
| {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["text"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 15); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5."; | |||
| // Test WordpieceTokenizer with with_offsets true | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Take operation on ds | |||
| ds = ds->Take(10); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::vector<std::string>> expected = { | |||
| {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}}; | |||
| std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}}; | |||
| std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["token"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| auto start = row["offsets_start"]; | |||
| std::shared_ptr<Tensor> de_expected_start_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor)); | |||
| mindspore::MSTensor expected_start_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor)); | |||
| EXPECT_MSTENSOR_EQ(start, expected_start_tensor); | |||
| auto limit = row["offsets_limit"]; | |||
| std::shared_ptr<Tensor> de_expected_limit_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor)); | |||
| mindspore::MSTensor expected_limit_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor)); | |||
| EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 10); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6."; | |||
| // Test WordpieceTokenizer with max_bytes_per_token equal to 0 | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create Take operation on ds | |||
| ds = ds->Take(10); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create an iterator over the result of the above dataset | |||
| // This will trigger the creation of the Execution Tree and launch it. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| EXPECT_NE(iter, nullptr); | |||
| // Iterate the dataset and get each row | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| iter->GetNextRow(&row); | |||
| std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, | |||
| {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}}; | |||
| uint64_t i = 0; | |||
| while (row.size() != 0) { | |||
| auto txt = row["token"]; | |||
| std::shared_ptr<Tensor> de_expected_tensor; | |||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||
| mindspore::MSTensor expected_tensor = | |||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||
| iter->GetNextRow(&row); | |||
| i++; | |||
| } | |||
| EXPECT_EQ(i, 10); | |||
| // Manually terminate the pipeline | |||
| iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1."; | |||
| // Test WordpieceTokenizer with nullptr vocab | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create a Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| // Expect failure: invalid WordpieceTokenizer input with nullptr vocab | |||
| EXPECT_EQ(iter, nullptr); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) { | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2."; | |||
| // Test WordpieceTokenizer with negative max_bytes_per_token | |||
| // Create a TextFile dataset | |||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; | |||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||
| EXPECT_NE(ds, nullptr); | |||
| // Create a vocab from vector | |||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||
| EXPECT_EQ(s, Status::OK()); | |||
| // Create WordpieceTokenizer operation on ds | |||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1); | |||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||
| // Create a Map operation on ds | |||
| ds = ds->Map({wordpiece_tokenizer}); | |||
| EXPECT_NE(ds, nullptr); | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token | |||
| EXPECT_EQ(iter, nullptr); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) { | |||
| // Testing the parameters of the UnicodeScriptTokenizer interface when with_offsets and keep_whitespace are default. | |||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess."; | |||