- # coding=utf-8
- # Copyright 2020 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Tokenization classes for Python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
- tokenization_utils_fast.py
- """
- import bisect
- import itertools
- import re
- import unicodedata
- from collections import OrderedDict
- from typing import Any, Dict, List, Optional, Tuple, Union, overload
-
- from .file_utils import PaddingStrategy, TensorType, add_end_docstrings
- from .tokenization_utils_base import (
- ENCODE_KWARGS_DOCSTRING,
- ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
- INIT_TOKENIZER_DOCSTRING,
- AddedToken,
- BatchEncoding,
- EncodedInput,
- EncodedInputPair,
- PreTokenizedInput,
- PreTokenizedInputPair,
- PreTrainedTokenizerBase,
- TextInput,
- TextInputPair,
- TruncationStrategy,
- )
-
- from fastNLP.core.log import logger
-
- # Slow tokenizers are saved as a vocabulary plus three separate files
- SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
- ADDED_TOKENS_FILE = "added_tokens.json"
- TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
-
-
- class Trie:
- """
- Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass.
- Loose reference: https://en.wikipedia.org/wiki/Trie
- """
-
- def __init__(self):
- self.data = {}
-
- def add(self, word: str):
- """
- Passes over every char (UTF-8 char) in `word` and recursively adds it to the internal `data` trie representation.
- The special key `""` is used to represent termination.
-
- This function is idempotent: adding the same word twice leaves the trie unchanged.
-
- Example::
-
- >>> trie = Trie()
- >>> trie.add("Hello 友達")
- >>> trie.data
- {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
- >>> trie.add("Hello")
- >>> trie.data
- {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
- """
- if not word:
- # Prevent empty string
- return
- ref = self.data
- for char in word:
- ref = ref.setdefault(char, {})
- ref[""] = 1
-
- def split(self, text: str) -> List[str]:
- """
- Will look for the words added to the trie within `text`. Output is the original string split along the
- boundaries of the words found.
-
- This trie will match the longest possible word first!
-
- Example::
-
- >>> trie = Trie()
- >>> trie.split("[CLS] This is a extra_id_100")
- ["[CLS] This is a extra_id_100"]
- >>> trie.add("[CLS]")
- >>> trie.add("extra_id_1")
- >>> trie.add("extra_id_100")
- >>> trie.split("[CLS] This is a extra_id_100")
- ["[CLS]", " This is a ", "extra_id_100"]
- """
- # Indexes are counted to the left of the character they precede:
- # in "hello", index 0 is left of "h", index 1 is between "h" and "e",
- # and index 5 is right of the "o".
-
- # States are going to capture every possible start (indexes as above)
- # as keys, and have as values a pointer to the position in the trie
- # where we're at. This is a partial match for now.
- # This lets us keep track of multiple matches while we're iterating
- # over the string.
- # If the trie contains "blowing" and "lower" and we encounter the
- # string "blower", we need to split into ["b", "lower"].
- # This is where we need to keep track of multiple possible starts.
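- # A small illustrative sketch of that case (mirrors the comment above):
- #     trie = Trie(); trie.add("blowing"); trie.add("lower")
- #     trie.split("blower")   # -> ["b", "lower"]
- # Start 0 partially matches "blowing" until it fails on "e", while start 1
- # matches "lower" to the end, so cuts are recorded at offsets 0, 1 and 6.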
- states = OrderedDict()
-
- # This will contain every index where we need
- # to cut.
- # We force a cut at offset 0 and at len(text) (added later)
- offsets = [0]
-
- # This is used by the lookahead, which needs to skip over
- # some text when the full match extends beyond the position
- # of the main for loop
- skip = None
- # Main loop, giving this algorithm O(n) complexity
- for current, current_char in enumerate(text):
- if skip and current < skip:
- # Prevents the lookahead from matching twice,
- # e.g. on extra_id_100 and then on id_100
- continue
-
- # This will track every state
- # that stops matching; we need to stop tracking those.
- # If we look at "lowball", we're going to match "l" (add it to states), "o", "w",
- # then fail on "b": we need to remove 0 from the valid states.
- to_remove = set()
- # Whenever we find a match, we need to drop everything:
- # this is a greedy algorithm that matches on the first token found
- reset = False
-
- # In this case, we already have partial matches (but unfinished)
- for start, trie_pointer in states.items():
- if "" in trie_pointer:
- # This is a final match, we need to reset and
- # store the results in `offsets`.
-
- # Lookahead to match longest first
- # Important in case of extra_id_1 vs extra_id_100
- lookahead_index = current
- end = current
- next_char = text[lookahead_index] if lookahead_index < len(text) else None
- while next_char in trie_pointer:
- trie_pointer = trie_pointer[next_char]
- lookahead_index += 1
- if "" in trie_pointer:
- end = lookahead_index
- skip = lookahead_index
-
- if lookahead_index == len(text):
- # End of string
- break
- next_char = text[lookahead_index]
- # End lookahead
-
- # Storing and resetting
- offsets.append(start)
- offsets.append(end)
- reset = True
- elif current_char in trie_pointer:
- # The current character being looked at has a match within the trie,
- # so update the pointer (it will be stored back into states later).
- trie_pointer = trie_pointer[current_char]
-
- # Storing back the new pointer into the states.
- # Partial matches got longer by one.
- states[start] = trie_pointer
- else:
- # The new character has no match in the trie, so we need
- # to stop keeping track of this partial match.
- # We can't do it directly within the loop because of how
- # Python dict iteration works
- to_remove.add(start)
-
- # Either clear all starts (we found a real match)
- # or clear only the partial matches that didn't work out.
- if reset:
- states = {}
- else:
- for start in to_remove:
- del states[start]
-
- # If this character is a starting character within the trie,
- # start keeping track of this partial match.
- if current_char in self.data:
- states[current] = self.data[current_char]
-
- # Handle a possible match that is still open at the end of the string.
- for start, trie_pointer in states.items():
- if "" in trie_pointer:
- # This is a final match, we need to reset and
- # store the results in `offsets`.
- end = len(text)
- offsets.append(start)
- offsets.append(end)
- # The longest cut is always the one with the lowest start,
- # i.e. the first item, so we need to break here.
- break
-
- # We have all the offsets now; we just need to do the actual splitting.
- # We still need to add the first part of the string and the
- # final part.
- offsets.append(len(text))
- tokens = []
- start = 0
- for end in offsets:
- if start == end:
- # This might happen if there's a match at index 0;
- # we're also preventing zero-width cuts in case of two
- # consecutive matches
- continue
- tokens.append(text[start:end])
- start = end
-
- return tokens
-
-
- def _is_whitespace(char):
- """Checks whether `char` is a whitespace character."""
- # \t, \n, and \r are technically control characters but we treat them
- # as whitespace since they are generally considered as such.
- if char == " " or char == "\t" or char == "\n" or char == "\r":
- return True
- cat = unicodedata.category(char)
- if cat == "Zs":
- return True
- return False
-
-
- def _is_control(char):
- """Checks whether `char` is a control character."""
- # These are technically control characters but we count them as whitespace
- # characters.
- if char == "\t" or char == "\n" or char == "\r":
- return False
- cat = unicodedata.category(char)
- if cat.startswith("C"):
- return True
- return False
-
-
- def _is_punctuation(char):
- """Checks whether `char` is a punctuation character."""
- cp = ord(char)
- # We treat all non-letter/number ASCII as punctuation.
- # Characters such as "^", "$", and "`" are not in the Unicode
- # Punctuation class but we treat them as punctuation anyways, for
- # consistency.
- if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
- return True
- cat = unicodedata.category(char)
- if cat.startswith("P"):
- return True
- return False
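- # For example, "$" has Unicode category "Sc" (currency symbol) and is only caught by the
- # ASCII range check above, whereas "。" (U+3002, category "Po") is caught by the category check.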
-
-
- def _is_end_of_word(text):
- """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
- last_char = text[-1]
- return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))
-
-
- def _is_start_of_word(text):
- """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
- first_char = text[0]
- return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
-
-
- def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
- """
- Inserts one token into an ordered list if it does not already exist. Note: ``token_list`` must be sorted.
- """
- insertion_idx = bisect.bisect_left(token_list, new_token)
- # Checks if new_token is already in the ordered token_list
- if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
- # new_token is in token_list, don't add
- return
- else:
- token_list.insert(insertion_idx, new_token)
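- # Illustrative example (hypothetical values): with token_list == ["[CLS]", "[SEP]"],
- # inserting "[MASK]" keeps the list sorted as ["[CLS]", "[MASK]", "[SEP]"], and
- # inserting "[SEP]" a second time leaves the list unchanged.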
-
-
- @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
- class PreTrainedTokenizer(PreTrainedTokenizerBase):
- """
- Base class for all slow tokenizers.
-
- Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.
-
- Handles all the shared methods for tokenization and special tokens, as well as methods for
- downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.
-
- This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle
- the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
- """
-
- def __init__(self, **kwargs):
- super().__init__(**kwargs)
-
- # Added tokens - We store this for both slow and fast tokenizers
- # until the serialization of Fast tokenizers is updated
- self.added_tokens_encoder: Dict[str, int] = {}
- self.added_tokens_decoder: Dict[int, str] = {}
- self.unique_no_split_tokens: List[str] = []
- self.tokens_trie = Trie()
-
- self._decode_use_source_tokenizer = False
-
- @property
- def is_fast(self) -> bool:
- return False
-
- @property
- def vocab_size(self) -> int:
- """
- :obj:`int`: Size of the base vocabulary (without the added tokens).
- """
- raise NotImplementedError
-
- def get_added_vocab(self) -> Dict[str, int]:
- """
- Returns the added tokens in the vocabulary as a dictionary of token to index.
-
- Returns:
- :obj:`Dict[str, int]`: The added tokens.
- """
- return self.added_tokens_encoder
-
- def __len__(self):
- """
- Size of the full vocabulary with the added tokens.
- """
- return self.vocab_size + len(self.added_tokens_encoder)
-
- def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
- """
- Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
- it with indices starting from the length of the current vocabulary.
-
- Args:
- new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
- Token(s) to add to the vocabulary. A token is only added if it's not already in the vocabulary (tested by
- checking if the tokenizer assigns the index of the ``unk_token`` to it).
- special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not the tokens should be added as special tokens.
-
- Returns:
- :obj:`int`: The number of tokens actually added to the vocabulary.
-
- Examples::
-
- # Let's see how to increase the vocabulary of the BERT model and tokenizer
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- model = BertModel.from_pretrained('bert-base-uncased')
-
- num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
- print('We have added', num_added_toks, 'tokens')
- # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
- model.resize_token_embeddings(len(tokenizer))
- """
- new_tokens = [str(tok) for tok in new_tokens]
-
- tokens_to_add = []
- for token in new_tokens:
- if not isinstance(token, str):
- raise TypeError(f"Token {token} is not a string but a {type(token)}.")
- if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
- token = token.lower()
- if (
- token != self.unk_token
- and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
- and token not in tokens_to_add
- ):
- tokens_to_add.append(token)
- if self.verbose:
- logger.info(f"Adding {token} to the vocabulary")
-
- added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
- added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
- self.added_tokens_encoder.update(added_tok_encoder)
- self.added_tokens_decoder.update(added_tok_decoder)
-
- # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
- if special_tokens:
- if len(new_tokens) == 1:
- _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
- else:
- self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
- else:
- # Or on the newly added tokens
- if len(tokens_to_add) == 1:
- _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
- else:
- self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
- self._create_trie(self.unique_no_split_tokens)
-
- return len(tokens_to_add)
-
- def _create_trie(self, unique_no_split_tokens):
- trie = Trie()
- for token in unique_no_split_tokens:
- if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens:
- trie.add(token.lower())
- else:
- trie.add(token)
- self.tokens_trie = trie
-
- def num_special_tokens_to_add(self, pair: bool = False) -> int:
- """
- Returns the number of added tokens when encoding a sequence with special tokens.
-
- .. note::
- This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
- put this inside your training loop.
-
- Args:
- pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether the number of added tokens should be computed in the case of a sequence pair or a single
- sequence.
-
- Returns:
- :obj:`int`: Number of special tokens added to sequences.
- """
- token_ids_0 = []
- token_ids_1 = []
- return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
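- # Rough sketch of typical values (model-dependent, not guaranteed): a BERT-style tokenizer
- # adds [CLS] and [SEP] around a single sequence and an extra [SEP] for a pair, so this
- # method would return 2 for `pair=False` and 3 for `pair=True`.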
-
- def tokenize(self, text: TextInput, **kwargs) -> List[str]:
- """
- Converts a string into a sequence of tokens, using the tokenizer.
-
- Splits into words for word-based vocabularies or into sub-words for sub-word-based vocabularies
- (BPE/SentencePiece/WordPiece). Takes care of added tokens.
-
- Args:
- text (:obj:`str`):
- The sequence to be encoded.
- **kwargs (additional keyword arguments):
- Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.
-
- Returns:
- :obj:`List[str]`: The list of tokens.
- """
- # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
- all_special_tokens_extended = dict(
- (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
- )
-
- text, kwargs = self.prepare_for_tokenization(text, **kwargs)
-
- if kwargs:
- logger.warning(f"Keyword arguments {kwargs} not recognized.")
-
- # TODO: should this be in the base class?
- if hasattr(self, "do_lower_case") and self.do_lower_case:
- # convert non-special tokens to lowercase
- escaped_special_toks = [
- re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
- ]
- pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
- text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
-
- no_split_token = set(self.unique_no_split_tokens)
- tokens = self.tokens_trie.split(text)
- # ["This is something", "<special_token_1>", " else"]
- for i, token in enumerate(tokens):
- if token in no_split_token:
- tok_extended = all_special_tokens_extended.get(token, None)
- left = tokens[i - 1] if i > 0 else None
- right = tokens[i + 1] if i < len(tokens) - 1 else None
- if isinstance(tok_extended, AddedToken):
- if tok_extended.rstrip and right:
- # A bit counter-intuitive, but we strip the left of the next string
- # since tok_extended.rstrip means the special token eats all whitespace on its right
- tokens[i + 1] = right.lstrip()
- # Strip white spaces on the left
- if tok_extended.lstrip and left:
- tokens[i - 1] = left.rstrip() # Opposite here
- else:
- # We strip left and right by default
- if right:
- tokens[i + 1] = right.lstrip()
- if left:
- tokens[i - 1] = left.rstrip()
- # ["This is something", "<special_token_1>", "else"]
- tokenized_text = []
- for token in tokens:
- # Need to skip possible empty (fully stripped) tokens
- if not token:
- continue
- if token in no_split_token:
- tokenized_text.append(token)
- else:
- tokenized_text.extend(self._tokenize(token))
- # ["This", " is", " something", "<special_token_1>", "else"]
- return tokenized_text
-
- def _tokenize(self, text, **kwargs):
- """
- Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based
- vocabularies or into sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
-
- Does NOT take care of added tokens.
- """
- raise NotImplementedError
-
- def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
- """
- Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
- vocabulary.
-
- Args:
- tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
-
- Returns:
- :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
- """
- if tokens is None:
- return None
-
- if isinstance(tokens, str):
- return self._convert_token_to_id_with_added_voc(tokens)
-
- ids = []
- for token in tokens:
- ids.append(self._convert_token_to_id_with_added_voc(token))
- return ids
-
- def _convert_token_to_id_with_added_voc(self, token):
- if token is None:
- return None
-
- if token in self.added_tokens_encoder:
- return self.added_tokens_encoder[token]
- return self._convert_token_to_id(token)
-
- def _convert_token_to_id(self, token):
- raise NotImplementedError
-
- def _encode_plus(
- self,
- text: Union[TextInput, PreTokenizedInput, EncodedInput],
- text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: Optional[int] = None,
- stride: int = 0,
- is_split_into_words: bool = False,
- pad_to_multiple_of: Optional[int] = None,
- return_tensors: Optional[Union[str, TensorType]] = None,
- return_token_type_ids: Optional[bool] = None,
- return_attention_mask: Optional[bool] = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_offsets_mapping: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- **kwargs
- ) -> BatchEncoding:
- def get_input_ids(text):
- if isinstance(text, str):
- tokens = self.tokenize(text, **kwargs)
- return self.convert_tokens_to_ids(tokens)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
- if is_split_into_words:
- tokens = list(
- itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
- )
- return self.convert_tokens_to_ids(tokens)
- else:
- return self.convert_tokens_to_ids(text)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
- return text
- else:
- if is_split_into_words:
- raise ValueError(
- f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
- )
- else:
- raise ValueError(
- f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
- )
-
- if return_offsets_mapping:
- raise NotImplementedError(
- "return_offset_mapping is not available when using Python tokenizers."
- "To use this feature, change your tokenizer to one deriving from "
- "transformers.PreTrainedTokenizerFast."
- "More information on available tokenizers at "
- "https://github.com/huggingface/transformers/pull/2674"
- )
-
- first_ids = get_input_ids(text)
- second_ids = get_input_ids(text_pair) if text_pair is not None else None
-
- return self.prepare_for_model(
- first_ids,
- pair_ids=second_ids,
- add_special_tokens=add_special_tokens,
- padding=padding_strategy.value,
- truncation=truncation_strategy.value,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- return_tensors=return_tensors,
- prepend_batch_axis=True,
- return_attention_mask=return_attention_mask,
- return_token_type_ids=return_token_type_ids,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- verbose=verbose,
- )
-
- def _batch_encode_plus(
- self,
- batch_text_or_text_pairs: Union[
- List[TextInput],
- List[TextInputPair],
- List[PreTokenizedInput],
- List[PreTokenizedInputPair],
- List[EncodedInput],
- List[EncodedInputPair],
- ],
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: Optional[int] = None,
- stride: int = 0,
- is_split_into_words: bool = False,
- pad_to_multiple_of: Optional[int] = None,
- return_tensors: Optional[Union[str, TensorType]] = None,
- return_token_type_ids: Optional[bool] = None,
- return_attention_mask: Optional[bool] = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_offsets_mapping: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- **kwargs
- ) -> BatchEncoding:
- def get_input_ids(text):
- if isinstance(text, str):
- tokens = self.tokenize(text, **kwargs)
- return self.convert_tokens_to_ids(tokens)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
- if is_split_into_words:
- tokens = list(
- itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
- )
- return self.convert_tokens_to_ids(tokens)
- else:
- return self.convert_tokens_to_ids(text)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
- return text
- else:
- raise ValueError(
- "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
- )
-
- if return_offsets_mapping:
- raise NotImplementedError(
- "return_offset_mapping is not available when using Python tokenizers."
- "To use this feature, change your tokenizer to one deriving from "
- "transformers.PreTrainedTokenizerFast."
- )
-
- input_ids = []
- for ids_or_pair_ids in batch_text_or_text_pairs:
- if not isinstance(ids_or_pair_ids, (list, tuple)):
- ids, pair_ids = ids_or_pair_ids, None
- elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
- ids, pair_ids = ids_or_pair_ids, None
- else:
- ids, pair_ids = ids_or_pair_ids
-
- first_ids = get_input_ids(ids)
- second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
- input_ids.append((first_ids, second_ids))
-
- batch_outputs = self._batch_prepare_for_model(
- input_ids,
- add_special_tokens=add_special_tokens,
- padding_strategy=padding_strategy,
- truncation_strategy=truncation_strategy,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- return_token_type_ids=return_token_type_ids,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- return_tensors=return_tensors,
- verbose=verbose,
- )
-
- return BatchEncoding(batch_outputs)
-
- @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
- def _batch_prepare_for_model(
- self,
- batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: Optional[int] = None,
- stride: int = 0,
- pad_to_multiple_of: Optional[int] = None,
- return_tensors: Optional[str] = None,
- return_token_type_ids: Optional[bool] = None,
- return_attention_mask: Optional[bool] = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- ) -> BatchEncoding:
- """
- Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
- adds special tokens, truncates sequences if they overflow while taking the special tokens into account, and
- manages a moving window (with a user-defined stride) for overflowing tokens
-
- Args:
- batch_ids_pairs: list of tokenized input ids or input ids pairs
- """
-
- batch_outputs = {}
- for first_ids, second_ids in batch_ids_pairs:
- outputs = self.prepare_for_model(
- first_ids,
- second_ids,
- add_special_tokens=add_special_tokens,
- padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
- truncation=truncation_strategy.value,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=None, # we pad in batch afterward
- return_attention_mask=False, # we pad in batch afterward
- return_token_type_ids=return_token_type_ids,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- return_tensors=None, # We convert the whole batch to tensors at the end
- prepend_batch_axis=False,
- verbose=verbose,
- )
-
- for key, value in outputs.items():
- if key not in batch_outputs:
- batch_outputs[key] = []
- batch_outputs[key].append(value)
-
- batch_outputs = self.pad(
- batch_outputs,
- padding=padding_strategy.value,
- max_length=max_length,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- )
-
- batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
- return batch_outputs
-
- def prepare_for_tokenization(
- self, text: str, is_split_into_words: bool = False, **kwargs
- ) -> Tuple[str, Dict[str, Any]]:
- """
- Performs any necessary transformations before tokenization.
-
- This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
- :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
-
- Args:
- text (:obj:`str`):
- The text to prepare.
- is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
- tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
- which it will tokenize. This is useful for NER or token classification.
- kwargs:
- Keyword arguments to use for the tokenization.
-
- Returns:
- :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
- """
- return (text, kwargs)
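- # Subclasses typically pop their own kwargs here. For instance, a byte-level BPE tokenizer
- # such as GPT-2's does roughly the following (sketch only, not part of this base class):
- #
- #     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
- #         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
- #         if is_split_into_words or add_prefix_space:
- #             text = " " + text
- #         return (text, kwargs)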
-
- def get_special_tokens_mask(
- self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
- ) -> List[int]:
- """
- Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
- Args:
- token_ids_0 (:obj:`List[int]`):
- List of ids of the first sequence.
- token_ids_1 (:obj:`List[int]`, `optional`):
- List of ids of the second sequence.
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not the token list is already formatted with special tokens for the model.
-
- Returns:
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
- """
- if already_has_special_tokens:
- if token_ids_1 is not None:
- raise ValueError(
- "You should not supply a second sequence if the provided sequence of "
- "ids is already formatted with special tokens for the model."
- )
-
- return super().get_special_tokens_mask(
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
- )
- return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
-
- @overload
- def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
- ...
-
- @overload
- def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
- ...
-
- def convert_ids_to_tokens(
- self, ids: Union[int, List[int]], skip_special_tokens: bool = False
- ) -> Union[str, List[str]]:
- """
- Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
- and added tokens.
-
- Args:
- ids (:obj:`int` or :obj:`List[int]`):
- The token id (or token ids) to convert to tokens.
- skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to remove special tokens in the decoding.
-
- Returns:
- :obj:`str` or :obj:`List[str]`: The decoded token(s).
- """
- if isinstance(ids, int):
- if ids in self.added_tokens_decoder:
- return self.added_tokens_decoder[ids]
- else:
- return self._convert_id_to_token(ids)
- tokens = []
- for index in ids:
- index = int(index)
- if skip_special_tokens and index in self.all_special_ids:
- continue
- if index in self.added_tokens_decoder:
- tokens.append(self.added_tokens_decoder[index])
- else:
- tokens.append(self._convert_id_to_token(index))
- return tokens
-
- def _convert_id_to_token(self, index: int) -> str:
- raise NotImplementedError
-
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
- return " ".join(tokens)
-
- def _decode(
- self,
- token_ids: List[int],
- skip_special_tokens: bool = False,
- clean_up_tokenization_spaces: bool = True,
- spaces_between_special_tokens: bool = True,
- **kwargs
- ) -> str:
- self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-
- filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
- # To avoid mixing byte-level and unicode for byte-level BPE,
- # we need to build the string separately for added tokens and byte-level tokens
- # cf. https://github.com/huggingface/transformers/issues/1133
- sub_texts = []
- current_sub_text = []
- for token in filtered_tokens:
- if skip_special_tokens and token in self.all_special_tokens:  # filtered_tokens are strings, so compare against the special token strings
- continue
- if token in self.added_tokens_encoder:
- if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
- current_sub_text = []
- sub_texts.append(token)
- else:
- current_sub_text.append(token)
- if current_sub_text:
- sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-
- if spaces_between_special_tokens:
- text = " ".join(sub_texts)
- else:
- text = "".join(sub_texts)
-
- if clean_up_tokenization_spaces:
- clean_text = self.clean_up_tokenization(text)
- return clean_text
- else:
- return text
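- # Illustrative example (hypothetical tokens): with "<new>" registered as an added token,
- # filtered_tokens == ["Hello", "<new>", "world"] is grouped into sub_texts
- # ["Hello", "<new>", "world"] ("Hello" and "world" each go through convert_tokens_to_string),
- # and joined as "Hello <new> world" when spaces_between_special_tokens is True.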