Signed-off-by: alex-yuyue <yue.yu1@huawei.com>pull/13364/head
| @@ -134,6 +134,15 @@ inline std::vector<std::pair<std::string, std::vector<int32_t>>> ClassIndexCharT | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| inline std::vector<std::pair<std::vector<char>, int64_t>> PairStringInt64ToPairCharInt64( | |||||
| const std::vector<std::pair<std::string, int64_t>> &s) { | |||||
| std::vector<std::pair<std::vector<char>, int64_t>> ret; | |||||
| std::transform(s.begin(), s.end(), std::back_inserter(ret), [](auto str) { | |||||
| return std::pair<std::vector<char>, int64_t>(std::vector<char>(str.first.begin(), str.first.end()), str.second); | |||||
| }); | |||||
| return ret; | |||||
| } | |||||
| template <class T> | template <class T> | ||||
| inline std::map<std::vector<char>, T> PadInfoStringToChar(const std::map<std::string, T> &s_pad_info) { | inline std::map<std::vector<char>, T> PadInfoStringToChar(const std::map<std::string, T> &s_pad_info) { | ||||
| std::map<std::vector<char>, T> ret; | std::map<std::vector<char>, T> ret; | ||||
| @@ -232,12 +232,17 @@ PYBIND_REGISTER(UnicodeCharTokenizerOperation, 1, ([](const py::module *m) { | |||||
| })); | })); | ||||
| })); | })); | ||||
| // TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ | |||||
| PYBIND_REGISTER(WordpieceTokenizerOp, 1, ([](const py::module *m) { | |||||
| (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>( | |||||
| *m, "WordpieceTokenizerOp") | |||||
| .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, | |||||
| const bool &>()); | |||||
| PYBIND_REGISTER(WordpieceTokenizerOperation, 1, ([](const py::module *m) { | |||||
| (void)py::class_<text::WordpieceTokenizerOperation, TensorOperation, | |||||
| std::shared_ptr<text::WordpieceTokenizerOperation>>(*m, | |||||
| "WordpieceTokenizerOperation") | |||||
| .def(py::init([](const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::string &unknown_token, bool with_offsets) { | |||||
| auto wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizerOperation>( | |||||
| vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets); | |||||
| THROW_IF_ERROR(wordpiece_tokenizer->ValidateParams()); | |||||
| return wordpiece_tokenizer; | |||||
| })); | |||||
| })); | })); | ||||
| PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { | PYBIND_REGISTER(JiebaMode, 0, ([](const py::module *m) { | ||||
| @@ -15,6 +15,8 @@ | |||||
| */ | */ | ||||
| #include <unistd.h> | #include <unistd.h> | ||||
| #include <fstream> | |||||
| #include <regex> | |||||
| #include "minddata/dataset/include/text.h" | #include "minddata/dataset/include/text.h" | ||||
| @@ -131,7 +133,7 @@ std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() { | |||||
| return jieba_tokenizer; | return jieba_tokenizer; | ||||
| } | } | ||||
| Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { | |||||
| Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) { | |||||
| if (word.empty()) { | if (word.empty()) { | ||||
| std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided."; | ||||
| MS_LOG(ERROR) << err_msg; | MS_LOG(ERROR) << err_msg; | ||||
| @@ -142,7 +144,59 @@ Status JiebaTokenizer::AddWord(const std::string &word, int64_t freq) { | |||||
| MS_LOG(ERROR) << err_msg; | MS_LOG(ERROR) << err_msg; | ||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | RETURN_STATUS_SYNTAX_ERROR(err_msg); | ||||
| } | } | ||||
| data_->words_list_.emplace_back(word, freq); | |||||
| data_->words_list_.emplace_back(CharToString(word), freq); | |||||
| return Status::OK(); | |||||
| } | |||||
| Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) { | |||||
| for (auto &word_freq_pair : user_dict) { | |||||
| RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second)); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) { | |||||
| std::vector<std::pair<std::string, int64_t>> user_dict; | |||||
| RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict)); | |||||
| RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict))); | |||||
| return Status::OK(); | |||||
| } | |||||
| Status JiebaTokenizer::ParserFile(const std::string &file_path, | |||||
| std::vector<std::pair<std::string, int64_t>> *const user_dict) { | |||||
| std::ifstream ifs(file_path); | |||||
| if (!ifs) { | |||||
| std::string err_msg = "JiebaTokenizer : Fail to load dictionary from the input file, check the file path."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| std::string line; | |||||
| while (std::getline(ifs, line)) { | |||||
| if (line.empty()) { | |||||
| continue; | |||||
| } | |||||
| std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$"); | |||||
| std::smatch tokens; | |||||
| std::regex_match(line, tokens, regex); | |||||
| if (std::regex_match(line, tokens, regex)) { | |||||
| if (tokens.size() == 2) { | |||||
| user_dict->emplace_back(tokens.str(1), 0); | |||||
| } else if (tokens.size() == 3) { | |||||
| user_dict->emplace_back(tokens.str(1), strtoll(tokens.str(2).c_str(), NULL, 0)); | |||||
| } else { | |||||
| continue; | |||||
| } | |||||
| } else { | |||||
| continue; | |||||
| } | |||||
| } | |||||
| MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of user input dictionary is: " << user_dict->size(); | |||||
| MS_LOG(INFO) << "Valid rows in input dictionary (Maximum of first 10 rows are shown.):"; | |||||
| for (std::size_t i = 0; i != user_dict->size(); ++i) { | |||||
| if (i >= 10) break; | |||||
| MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second; | |||||
| } | |||||
| return Status::OK(); | return Status::OK(); | ||||
| } | } | ||||
| @@ -310,6 +364,32 @@ std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() { | |||||
| return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_); | return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_); | ||||
| } | } | ||||
| // WordpieceTokenizer | |||||
| struct WordpieceTokenizer::Data { | |||||
| Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token, | |||||
| const std::vector<char> &unknown_token, bool with_offsets) | |||||
| : vocab_(vocab), | |||||
| suffix_indicator_(CharToString(suffix_indicator)), | |||||
| max_bytes_per_token_(max_bytes_per_token), | |||||
| unknown_token_(CharToString(unknown_token)), | |||||
| with_offsets_(with_offsets) {} | |||||
| std::shared_ptr<Vocab> vocab_; | |||||
| std::string suffix_indicator_; | |||||
| int32_t max_bytes_per_token_; | |||||
| std::string unknown_token_; | |||||
| bool with_offsets_; | |||||
| }; | |||||
| WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::vector<char> &unknown_token, | |||||
| bool with_offsets) | |||||
| : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {} | |||||
| std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() { | |||||
| return std::make_shared<WordpieceTokenizerOperation>( | |||||
| data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_); | |||||
| } | |||||
| #ifndef _WIN32 | #ifndef _WIN32 | ||||
| // UnicodeScriptTokenizer | // UnicodeScriptTokenizer | ||||
| struct UnicodeScriptTokenizer::Data { | struct UnicodeScriptTokenizer::Data { | ||||
| @@ -52,7 +52,7 @@ class BasicTokenizer final : public TensorTransform { | |||||
| /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). | /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). | ||||
| /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', | /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', | ||||
| /// '[MASK]' (default=true). | /// '[MASK]' (default=true). | ||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false, | explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false, | ||||
| const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, | const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, | ||||
| bool with_offsets = false); | bool with_offsets = false); | ||||
| @@ -88,7 +88,7 @@ class BertTokenizer final : public TensorTransform { | |||||
| /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). | /// false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). | ||||
| /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', | /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]', | ||||
| /// '[MASK]' (default=true). | /// '[MASK]' (default=true). | ||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", | explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", | ||||
| int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", | int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", | ||||
| bool lower_case = false, bool keep_whitespace = false, | bool lower_case = false, bool keep_whitespace = false, | ||||
| @@ -145,7 +145,7 @@ class JiebaTokenizer final : public TensorTransform { | |||||
| /// - JiebaMode.kMP, tokenize with MPSegment algorithm. | /// - JiebaMode.kMP, tokenize with MPSegment algorithm. | ||||
| /// - JiebaMode.kHMM, tokenize with Hiddel Markov Model Segment algorithm. | /// - JiebaMode.kHMM, tokenize with Hiddel Markov Model Segment algorithm. | ||||
| /// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | /// - JiebaMode.kMIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | ||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, | explicit JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, | ||||
| const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false) | const JiebaMode &mode = JiebaMode::kMix, bool with_offsets = false) | ||||
| : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {} | : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {} | ||||
| @@ -156,7 +156,24 @@ class JiebaTokenizer final : public TensorTransform { | |||||
| /// \brief Destructor | /// \brief Destructor | ||||
| ~JiebaTokenizer() = default; | ~JiebaTokenizer() = default; | ||||
| Status AddWord(const std::string &word, int64_t freq = 0); | |||||
| /// \brief Add user defined word to JiebaTokenizer's dictionary. | |||||
| /// \param[in] word The word to be added to the JiebaTokenizer instance. | |||||
| /// The added word will not be written into the built-in dictionary on disk. | |||||
| /// \param[in] freq The frequency of the word to be added. The higher the frequency, | |||||
| /// the better chance the word will be tokenized (default=None, use default frequency). | |||||
| Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); } | |||||
| /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary. | |||||
| /// \param[in] user_dict Vector of word-freq pairs to be added to JiebaTokenizer's dictionary. | |||||
| Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) { | |||||
| return AddDictChar(PairStringInt64ToPairCharInt64(user_dict)); | |||||
| } | |||||
| /// \brief Add user defined dictionary of word-freq pairs to JiebaTokenizer's dictionary from a file. | |||||
| /// Only valid word-freq pairs in user provided file will be added into the dictionary. | |||||
| /// Rows containing invalid input will be ignored, no error nor warning Status is returned. | |||||
| /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs. | |||||
| Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); } | |||||
| protected: | protected: | ||||
| /// \brief Function to convert TensorTransform object into a TensorOperation object. | /// \brief Function to convert TensorTransform object into a TensorOperation object. | ||||
| @@ -164,6 +181,20 @@ class JiebaTokenizer final : public TensorTransform { | |||||
| std::shared_ptr<TensorOperation> Parse() override; | std::shared_ptr<TensorOperation> Parse() override; | ||||
| private: | private: | ||||
| /// \brief Parser user defined word by file. | |||||
| /// \param[in] file_path Path to the user defined file. | |||||
| /// \param[in] user_dict Vector of word-freq pairs extracted from the user provided file. | |||||
| Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict); | |||||
| /// \brief Used to translate all API string to vector of char and back | |||||
| Status AddWordChar(const std::vector<char> &word, int64_t freq = 0); | |||||
| /// \brief Used to translate all API string to vector of char and back | |||||
| Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict); | |||||
| /// \brief Used to translate all API string to vector of char and back | |||||
| Status AddDictChar(const std::vector<char> &file_path); | |||||
| struct Data; | struct Data; | ||||
| std::shared_ptr<Data> data_; | std::shared_ptr<Data> data_; | ||||
| }; | }; | ||||
| @@ -292,7 +323,7 @@ class RegexTokenizer final : public TensorTransform { | |||||
| /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be | /// \param[in] keep_delim_pattern The string matched by 'delim_pattern' can be kept as a token if it can be | ||||
| /// matched by 'keep_delim_pattern'. The default value is an empty string ("") | /// matched by 'keep_delim_pattern'. The default value is an empty string ("") | ||||
| /// which means that delimiters will not be kept as an output token (default=""). | /// which means that delimiters will not be kept as an output token (default=""). | ||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false) | explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false) | ||||
| : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {} | : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {} | ||||
| @@ -416,7 +447,7 @@ class TruncateSequencePair final : public TensorTransform { | |||||
| class UnicodeCharTokenizer final : public TensorTransform { | class UnicodeCharTokenizer final : public TensorTransform { | ||||
| public: | public: | ||||
| /// \brief Constructor. | /// \brief Constructor. | ||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit UnicodeCharTokenizer(bool with_offsets = false); | explicit UnicodeCharTokenizer(bool with_offsets = false); | ||||
| /// \brief Destructor | /// \brief Destructor | ||||
| @@ -432,13 +463,45 @@ class UnicodeCharTokenizer final : public TensorTransform { | |||||
| std::shared_ptr<Data> data_; | std::shared_ptr<Data> data_; | ||||
| }; | }; | ||||
| /// \brief Tokenize scalar token or 1-D tokens to 1-D subword tokens. | |||||
| class WordpieceTokenizer final : public TensorTransform { | |||||
| public: | |||||
| /// \brief Constructor. | |||||
| /// \param[in] vocab A Vocab object. | |||||
| /// \param[in] suffix_indicator Used to show that the subword is the last part of a word (default='##'). | |||||
| /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). | |||||
| /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty | |||||
| /// string, else return the string specified (default='[UNK]'). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", | |||||
| int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", | |||||
| bool with_offsets = false) | |||||
| : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), | |||||
| with_offsets) {} | |||||
| explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets); | |||||
| /// \brief Destructor | |||||
| ~WordpieceTokenizer() = default; | |||||
| protected: | |||||
| /// \brief Function to convert TensorTransform object into a TensorOperation object. | |||||
| /// \return Shared pointer to TensorOperation object. | |||||
| std::shared_ptr<TensorOperation> Parse() override; | |||||
| private: | |||||
| struct Data; | |||||
| std::shared_ptr<Data> data_; | |||||
| }; | |||||
| #ifndef _WIN32 | #ifndef _WIN32 | ||||
| /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. | /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. | ||||
| class UnicodeScriptTokenizer final : public TensorTransform { | class UnicodeScriptTokenizer final : public TensorTransform { | ||||
| public: | public: | ||||
| /// \brief Constructor. | /// \brief Constructor. | ||||
| /// \param[in] keep_whitespace If or not emit whitespace tokens (default=false). | |||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] keep_whitespace Whether or not emit whitespace tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false); | explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false); | ||||
| /// \brief Destructor | /// \brief Destructor | ||||
| @@ -458,7 +521,7 @@ class UnicodeScriptTokenizer final : public TensorTransform { | |||||
| class WhitespaceTokenizer final : public TensorTransform { | class WhitespaceTokenizer final : public TensorTransform { | ||||
| public: | public: | ||||
| /// \brief Constructor. | /// \brief Constructor. | ||||
| /// \param[in] with_offsets If or not output offsets of tokens (default=false). | |||||
| /// \param[in] with_offsets Whether or not output offsets of tokens (default=false). | |||||
| explicit WhitespaceTokenizer(bool with_offsets = false); | explicit WhitespaceTokenizer(bool with_offsets = false); | ||||
| /// \brief Destructor | /// \brief Destructor | ||||
| @@ -36,6 +36,7 @@ | |||||
| #include "minddata/dataset/text/kernels/to_number_op.h" | #include "minddata/dataset/text/kernels/to_number_op.h" | ||||
| #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h" | ||||
| #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" | ||||
| #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" | |||||
| #ifndef _WIN32 | #ifndef _WIN32 | ||||
| #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | #include "minddata/dataset/text/kernels/unicode_script_tokenizer_op.h" | ||||
| #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h" | ||||
| @@ -396,6 +397,39 @@ std::shared_ptr<TensorOp> UnicodeCharTokenizerOperation::Build() { | |||||
| return tensor_op; | return tensor_op; | ||||
| } | } | ||||
| // WordpieceTokenizerOperation | |||||
| WordpieceTokenizerOperation::WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab, | |||||
| const std::string &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||||
| bool with_offsets) | |||||
| : vocab_(vocab), | |||||
| suffix_indicator_(suffix_indicator), | |||||
| max_bytes_per_token_(max_bytes_per_token), | |||||
| unknown_token_(unknown_token), | |||||
| with_offsets_(with_offsets) {} | |||||
| Status WordpieceTokenizerOperation::ValidateParams() { | |||||
| if (vocab_ == nullptr) { | |||||
| std::string err_msg = "WordpieceTokenizer: vocab object type is incorrect or null."; | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| if (max_bytes_per_token_ < 0) { | |||||
| std::string err_msg = | |||||
| "WordpieceTokenizer : The parameter max_bytes_per_token must be greater than or equal to 0: " + | |||||
| std::to_string(max_bytes_per_token_); | |||||
| MS_LOG(ERROR) << err_msg; | |||||
| RETURN_STATUS_SYNTAX_ERROR(err_msg); | |||||
| } | |||||
| return Status::OK(); | |||||
| } | |||||
| std::shared_ptr<TensorOp> WordpieceTokenizerOperation::Build() { | |||||
| std::shared_ptr<WordpieceTokenizerOp> tensor_op = std::make_shared<WordpieceTokenizerOp>( | |||||
| vocab_, suffix_indicator_, max_bytes_per_token_, unknown_token_, with_offsets_); | |||||
| return tensor_op; | |||||
| } | |||||
| #ifndef _WIN32 | #ifndef _WIN32 | ||||
| // UnicodeScriptTokenizerOperation | // UnicodeScriptTokenizerOperation | ||||
| UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) | UnicodeScriptTokenizerOperation::UnicodeScriptTokenizerOperation(bool keep_whitespace, bool with_offsets) | ||||
| @@ -49,6 +49,7 @@ constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair"; | |||||
| constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; | constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer"; | ||||
| constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; | constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer"; | ||||
| constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; | constexpr char kWhitespaceTokenizerOperation[] = "WhitespaceTokenizer"; | ||||
| constexpr char kWordpieceTokenizerOperation[] = "WordpieceTokenizer"; | |||||
| /* ####################################### Derived TensorOperation classes ################################# */ | /* ####################################### Derived TensorOperation classes ################################# */ | ||||
| @@ -318,6 +319,28 @@ class UnicodeCharTokenizerOperation : public TensorOperation { | |||||
| bool with_offsets_; | bool with_offsets_; | ||||
| }; | }; | ||||
| class WordpieceTokenizerOperation : public TensorOperation { | |||||
| public: | |||||
| explicit WordpieceTokenizerOperation(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator, | |||||
| int32_t max_bytes_per_token, const std::string &unknown_token, | |||||
| bool with_offsets); | |||||
| ~WordpieceTokenizerOperation() = default; | |||||
| std::shared_ptr<TensorOp> Build() override; | |||||
| Status ValidateParams() override; | |||||
| std::string Name() const override { return kWordpieceTokenizerOperation; } | |||||
| private: | |||||
| std::shared_ptr<Vocab> vocab_; | |||||
| std::string suffix_indicator_; | |||||
| int32_t max_bytes_per_token_; | |||||
| std::string unknown_token_; | |||||
| bool with_offsets_; | |||||
| }; | |||||
| #ifndef _WIN32 | #ifndef _WIN32 | ||||
| class UnicodeScriptTokenizerOperation : public TensorOperation { | class UnicodeScriptTokenizerOperation : public TensorOperation { | ||||
| public: | public: | ||||
| @@ -2193,7 +2193,7 @@ def _pyfunc_worker_init(pyfunc_list): | |||||
| # All exceptions will be raised to main processes | # All exceptions will be raised to main processes | ||||
| def _pyfunc_worker_exec(index, op_id, mapping, lock, record, *args): | def _pyfunc_worker_exec(index, op_id, mapping, lock, record, *args): | ||||
| """ | """ | ||||
| Internal function for call certain pyfunc in python process. | |||||
| Internal function for call certain pyfunc in Python process. | |||||
| """ | """ | ||||
| # Some threads in multiprocess.pool can't process sigint signal, | # Some threads in multiprocess.pool can't process sigint signal, | ||||
| # and will occur hang problem, so ctrl+c will pass to parent process. | # and will occur hang problem, so ctrl+c will pass to parent process. | ||||
| @@ -2366,7 +2366,7 @@ class MapDataset(Dataset): | |||||
| # Pass #1, look for Python callables and build list | # Pass #1, look for Python callables and build list | ||||
| for op in self.operations: | for op in self.operations: | ||||
| # our c transforms is now callable and should not be run in python multithreading | |||||
| # our c transforms is now callable and should not be run in Python multithreading | |||||
| if callable(op) and str(op).find("c_transform") < 0: | if callable(op) and str(op).find("c_transform") < 0: | ||||
| callable_list.append(op) | callable_list.append(op) | ||||
| @@ -2383,7 +2383,7 @@ class MapDataset(Dataset): | |||||
| _op_process = _manager.dict() | _op_process = _manager.dict() | ||||
| _process_lock = _manager.Lock() | _process_lock = _manager.Lock() | ||||
| for op in self.operations: | for op in self.operations: | ||||
| # our c transforms is now callable and should not be run in python multithreading | |||||
| # our c transforms is now callable and should not be run in Python multithreading | |||||
| if callable(op) and str(op).find("c_transform") < 0: | if callable(op) and str(op).find("c_transform") < 0: | ||||
| # Wrap Python callable into _PythonCallable | # Wrap Python callable into _PythonCallable | ||||
| iter_specific_operations.append(_PythonCallable(op, idx, op_id, _op_process, _process_lock, | iter_specific_operations.append(_PythonCallable(op, idx, op_id, _op_process, _process_lock, | ||||
| @@ -609,7 +609,7 @@ class SubsetSampler(BuiltinSampler): | |||||
| Samples the elements from a sequence of indices. | Samples the elements from a sequence of indices. | ||||
| Args: | Args: | ||||
| indices (Any iterable python object but string): A sequence of indices. | |||||
| indices (Any iterable Python object but string): A sequence of indices. | |||||
| num_samples (int, optional): Number of elements to sample (default=None, all elements). | num_samples (int, optional): Number of elements to sample (default=None, all elements). | ||||
| Examples: | Examples: | ||||
| @@ -101,7 +101,7 @@ class JiebaTokenizer(TextTensorOperation): | |||||
| - JiebaMode.MP, tokenize with MPSegment algorithm. | - JiebaMode.MP, tokenize with MPSegment algorithm. | ||||
| - JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm. | - JiebaMode.HMM, tokenize with Hiddel Markov Model Segment algorithm. | ||||
| - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> from mindspore.dataset.text import JiebaMode | >>> from mindspore.dataset.text import JiebaMode | ||||
| @@ -185,6 +185,9 @@ class JiebaTokenizer(TextTensorOperation): | |||||
| word2 None | word2 None | ||||
| word3 freq3 | word3 freq3 | ||||
| Only valid word-freq pairs in user provided file will be added into the dictionary. | |||||
| Rows containing invalid input will be ignored. No error nor warning Status is returned. | |||||
| Examples: | Examples: | ||||
| >>> from mindspore.dataset.text import JiebaMode | >>> from mindspore.dataset.text import JiebaMode | ||||
| >>> jieba_hmm_file = "/path/to/jieba/hmm/file" | >>> jieba_hmm_file = "/path/to/jieba/hmm/file" | ||||
| @@ -220,16 +223,16 @@ class JiebaTokenizer(TextTensorOperation): | |||||
| "user dict file {} is not exist.".format(file_path)) | "user dict file {} is not exist.".format(file_path)) | ||||
| real_file_path = os.path.realpath(file_path) | real_file_path = os.path.realpath(file_path) | ||||
| file_dict = open(real_file_path) | file_dict = open(real_file_path) | ||||
| data_re = re.compile('^(.+?)( [0-9]+)?$', re.U) | |||||
| data_re = re.compile('^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$', re.U) | |||||
| words_list = [] | words_list = [] | ||||
| for item in file_dict: | for item in file_dict: | ||||
| data = item.strip() | data = item.strip() | ||||
| if not isinstance(data, str): | if not isinstance(data, str): | ||||
| data = self.__decode(data) | data = self.__decode(data) | ||||
| words = data_re.match(data).groups() | |||||
| if len(words) != 2: | |||||
| raise ValueError( | |||||
| "user dict file {} format error.".format(real_file_path)) | |||||
| tmp = data_re.match(data) | |||||
| if not tmp: | |||||
| continue | |||||
| words = tmp.groups() | |||||
| words_list.append(words) | words_list.append(words) | ||||
| file_dict.close() | file_dict.close() | ||||
| return words_list | return words_list | ||||
| @@ -447,7 +450,7 @@ class UnicodeCharTokenizer(TextTensorOperation): | |||||
| Tokenize a scalar tensor of UTF-8 string to Unicode characters. | Tokenize a scalar tensor of UTF-8 string to Unicode characters. | ||||
| Args: | Args: | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| @@ -469,8 +472,7 @@ class UnicodeCharTokenizer(TextTensorOperation): | |||||
| return cde.UnicodeCharTokenizerOperation(self.with_offsets) | return cde.UnicodeCharTokenizerOperation(self.with_offsets) | ||||
| # TODO(alexyuyue): Need to decouple WordpieceTokenizerOp to WordpieceTokenizerOperation after it's supported in C++ | |||||
| class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||||
| class WordpieceTokenizer(TextTensorOperation): | |||||
| """ | """ | ||||
| Tokenize scalar token or 1-D tokens to 1-D subword tokens. | Tokenize scalar token or 1-D tokens to 1-D subword tokens. | ||||
| @@ -480,7 +482,7 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||||
| max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100). | max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100). | ||||
| unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string, | unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is empty string, | ||||
| return the token directly, else return 'unknown_token' (default='[UNK]'). | return the token directly, else return 'unknown_token' (default='[UNK]'). | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"] | >>> vocab_list = ["book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the"] | ||||
| @@ -506,8 +508,10 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): | |||||
| self.max_bytes_per_token = max_bytes_per_token | self.max_bytes_per_token = max_bytes_per_token | ||||
| self.unknown_token = unknown_token | self.unknown_token = unknown_token | ||||
| self.with_offsets = with_offsets | self.with_offsets = with_offsets | ||||
| super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, | |||||
| self.unknown_token, self.with_offsets) | |||||
| def parse(self): | |||||
| return cde.WordpieceTokenizerOperation(self.vocab, self.suffix_indicator, self.max_bytes_per_token, | |||||
| self.unknown_token, self.with_offsets) | |||||
| class PythonTokenizer: | class PythonTokenizer: | ||||
| @@ -561,7 +565,7 @@ if platform.system().lower() != 'windows': | |||||
| only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). | only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). | ||||
| preserve_unused_token (bool, optional): If True, do not split special tokens like | preserve_unused_token (bool, optional): If True, do not split special tokens like | ||||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> from mindspore.dataset.text import NormalizeForm | >>> from mindspore.dataset.text import NormalizeForm | ||||
| @@ -627,7 +631,7 @@ if platform.system().lower() != 'windows': | |||||
| only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). | only effective when `lower_case` is False. See NormalizeUTF8 for details (default=NormalizeForm.NONE). | ||||
| preserve_unused_token (bool, optional): If True, do not split special tokens like | preserve_unused_token (bool, optional): If True, do not split special tokens like | ||||
| '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True). | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> from mindspore.dataset.text import NormalizeForm | >>> from mindspore.dataset.text import NormalizeForm | ||||
| @@ -782,7 +786,7 @@ if platform.system().lower() != 'windows': | |||||
| keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token | keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token | ||||
| if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('') | if it can be matched by 'keep_delim_pattern'. The default value is an empty str ('') | ||||
| which means that delimiters will not be kept as an output token (default=''). | which means that delimiters will not be kept as an output token (default=''). | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| @@ -818,8 +822,8 @@ if platform.system().lower() != 'windows': | |||||
| UnicodeScriptTokenizer is not supported on Windows platform yet. | UnicodeScriptTokenizer is not supported on Windows platform yet. | ||||
| Args: | Args: | ||||
| keep_whitespace (bool, optional): If or not emit whitespace tokens (default=False). | |||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| keep_whitespace (bool, optional): Whether or not emit whitespace tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| @@ -854,7 +858,7 @@ if platform.system().lower() != 'windows': | |||||
| WhitespaceTokenizer is not supported on Windows platform yet. | WhitespaceTokenizer is not supported on Windows platform yet. | ||||
| Args: | Args: | ||||
| with_offsets (bool, optional): If or not output offsets of tokens (default=False). | |||||
| with_offsets (bool, optional): Whether or not output offsets of tokens (default=False). | |||||
| Examples: | Examples: | ||||
| >>> # If with_offsets=False, default output one column {["text", dtype=str]} | >>> # If with_offsets=False, default output one column {["text", dtype=str]} | ||||
| @@ -988,7 +988,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) { | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | EXPECT_NE(jieba_tokenizer, nullptr); | ||||
| // Add word with freq not provided (default 0) | // Add word with freq not provided (default 0) | ||||
| jieba_tokenizer->AddWord("男默女泪"); | |||||
| ASSERT_OK(jieba_tokenizer->AddWord("男默女泪")); | |||||
| // Create Map operation on ds | // Create Map operation on ds | ||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | ds = ds->Map({jieba_tokenizer}, {"text"}); | ||||
| @@ -1038,7 +1038,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) { | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | EXPECT_NE(jieba_tokenizer, nullptr); | ||||
| // Add word with freq is set explicitly to 0 | // Add word with freq is set explicitly to 0 | ||||
| jieba_tokenizer->AddWord("男默女泪", 0); | |||||
| ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0)); | |||||
| // Create Map operation on ds | // Create Map operation on ds | ||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | ds = ds->Map({jieba_tokenizer}, {"text"}); | ||||
| @@ -1088,7 +1088,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) { | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | EXPECT_NE(jieba_tokenizer, nullptr); | ||||
| // Add word with freq 10 | // Add word with freq 10 | ||||
| jieba_tokenizer->AddWord("男默女泪", 10); | |||||
| ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10)); | |||||
| // Create Map operation on ds | // Create Map operation on ds | ||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | ds = ds->Map({jieba_tokenizer}, {"text"}); | ||||
| @@ -1138,7 +1138,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) { | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | EXPECT_NE(jieba_tokenizer, nullptr); | ||||
| // Add word with freq 20000 | // Add word with freq 20000 | ||||
| jieba_tokenizer->AddWord("江大桥", 20000); | |||||
| ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000)); | |||||
| // Create Map operation on ds | // Create Map operation on ds | ||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | ds = ds->Map({jieba_tokenizer}, {"text"}); | ||||
| @@ -1194,6 +1194,115 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) { | |||||
| EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); | EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); | ||||
| } | } | ||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) { | |||||
| // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair. | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create jieba_tokenizer operation on ds | |||||
| std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = | |||||
| std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| // Add word with freq 20000 | |||||
| std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}}; | |||||
| ASSERT_OK(jieba_tokenizer->AddDict(user_dict)); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["text"]; | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 1); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) { | |||||
| // Testing AddDict of JiebaTokenizer when the input is a path to dict. | |||||
| // Test error scenario for AddDict: invalid path | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile."; | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; | |||||
| std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; | |||||
| std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create jieba_tokenizer operation on ds | |||||
| std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = | |||||
| std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); | |||||
| EXPECT_NE(jieba_tokenizer, nullptr); | |||||
| // Load dict from txt file | |||||
| std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt"; | |||||
| std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt"; | |||||
| EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path)); | |||||
| ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path)); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({jieba_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["text"]; | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 1); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { | TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { | ||||
| // Testing the parameter of SlidingWindow interface when the axis is 0. | // Testing the parameter of SlidingWindow interface when the axis is 0. | ||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; | MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; | ||||
| @@ -2523,6 +2632,421 @@ TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) { | |||||
| iter->Stop(); | iter->Stop(); | ||||
| } | } | ||||
| std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my", | |||||
| "is", "love", "dur", "##ing", "the"}; | |||||
| std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"}; | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1."; | |||||
| // Test WordpieceTokenizer with default parameters on English vocab | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Take operation on ds | |||||
| ds = ds->Take(10); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::vector<std::string>> expected = { | |||||
| {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["text"]; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 10); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2."; | |||||
| // Test WordpieceTokenizer with empty unknown_token | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Take operation on ds | |||||
| ds = ds->Take(10); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::vector<std::string>> expected = { | |||||
| {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["text"]; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 10); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3."; | |||||
| // Test WordpieceTokenizer with non-default max_bytes_per_token | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Take operation on ds | |||||
| ds = ds->Take(10); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"}, | |||||
| {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["text"]; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 10); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4."; | |||||
| // Test WordpieceTokenizer with default parameters on Chinese vocab | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Skip operation on ds | |||||
| ds = ds->Skip(10); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Take operation on ds | |||||
| ds = ds->Take(15); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"}, | |||||
| {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}}; | |||||
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["text"]; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 15); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5."; | |||||
| // Test WordpieceTokenizer with with_offsets true | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Take operation on ds | |||||
| ds = ds->Take(10); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::vector<std::string>> expected = { | |||||
| {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}}; | |||||
| std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}}; | |||||
| std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}}; | |||||
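| // offsets_start/offsets_limit hold the begin (inclusive) and end (exclusive) byte positions of each subword within its source word, e.g. "favor" covers [0, 5) and "##ite" covers [5, 8) of "favorite" | |||||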
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["token"]; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| auto start = row["offsets_start"]; | |||||
| std::shared_ptr<Tensor> de_expected_start_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor)); | |||||
| mindspore::MSTensor expected_start_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(start, expected_start_tensor); | |||||
| auto limit = row["offsets_limit"]; | |||||
| std::shared_ptr<Tensor> de_expected_limit_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor)); | |||||
| mindspore::MSTensor expected_limit_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 10); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6."; | |||||
| // Test WordpieceTokenizer with max_bytes_per_token set to 0 | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create Take operation on ds | |||||
| ds = ds->Take(10); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = | |||||
| std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create an iterator over the result of the above dataset | |||||
| // This will trigger the creation of the Execution Tree and launch it. | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| EXPECT_NE(iter, nullptr); | |||||
| // Iterate the dataset and get each row | |||||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||||
| iter->GetNextRow(&row); | |||||
| std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, | |||||
| {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}}; | |||||
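| // With max_bytes_per_token set to 0, every input word exceeds the byte limit and is replaced by the unknown token "[UNK]" | |||||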
| uint64_t i = 0; | |||||
| while (row.size() != 0) { | |||||
| auto txt = row["token"]; | |||||
| std::shared_ptr<Tensor> de_expected_tensor; | |||||
| ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); | |||||
| mindspore::MSTensor expected_tensor = | |||||
| mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); | |||||
| EXPECT_MSTENSOR_EQ(txt, expected_tensor); | |||||
| iter->GetNextRow(&row); | |||||
| i++; | |||||
| } | |||||
| EXPECT_EQ(i, 10); | |||||
| // Manually terminate the pipeline | |||||
| iter->Stop(); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1."; | |||||
| // Test WordpieceTokenizer with nullptr vocab | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create a Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| // Expect failure: invalid WordpieceTokenizer input with nullptr vocab | |||||
| EXPECT_EQ(iter, nullptr); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) { | |||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2."; | |||||
| // Test WordpieceTokenizer with negative max_bytes_per_token | |||||
| // Create a TextFile dataset | |||||
| std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; | |||||
| std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| // Create a vocab from vector | |||||
| std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); | |||||
| Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab); | |||||
| EXPECT_EQ(s, Status::OK()); | |||||
| // Create WordpieceTokenizer operation on ds | |||||
| std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1); | |||||
| EXPECT_NE(wordpiece_tokenizer, nullptr); | |||||
| // Create a Map operation on ds | |||||
| ds = ds->Map({wordpiece_tokenizer}); | |||||
| EXPECT_NE(ds, nullptr); | |||||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||||
| // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token | |||||
| EXPECT_EQ(iter, nullptr); | |||||
| } | |||||
| TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) { | TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) { | ||||
| // Testing the parameters of the UnicodeScriptTokenizer interface when with_offsets and keep_whitespace are default. | // Testing the parameters of the UnicodeScriptTokenizer interface when with_offsets and keep_whitespace are default. | ||||
| MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess."; | MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess."; | ||||